]> git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/blob - drivers/misc/habanalabs/firmware_if.c
habanalabs: move event handling to common firmware file
[mirror_ubuntu-jammy-kernel.git] / drivers / misc / habanalabs / firmware_if.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4 * Copyright 2016-2019 HabanaLabs, Ltd.
5 * All Rights Reserved.
6 */
7
8 #include "habanalabs.h"
9 #include "include/hl_boot_if.h"
10
11 #include <linux/firmware.h>
12 #include <linux/genalloc.h>
13 #include <linux/io-64-nonatomic-lo-hi.h>
14 #include <linux/slab.h>
15
16 /**
17 * hl_fw_load_fw_to_device() - Load F/W code to device's memory.
18 * @hdev: pointer to hl_device structure.
19 *
20 * Copy fw code from firmware file to device memory.
21 *
22 * Return: 0 on success, non-zero for failure.
23 */
24 int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name,
25 void __iomem *dst)
26 {
27 const struct firmware *fw;
28 const u64 *fw_data;
29 size_t fw_size;
30 int rc;
31
32 rc = request_firmware(&fw, fw_name, hdev->dev);
33 if (rc) {
34 dev_err(hdev->dev, "Firmware file %s is not found!\n", fw_name);
35 goto out;
36 }
37
38 fw_size = fw->size;
39 if ((fw_size % 4) != 0) {
40 dev_err(hdev->dev, "Illegal %s firmware size %zu\n",
41 fw_name, fw_size);
42 rc = -EINVAL;
43 goto out;
44 }
45
46 dev_dbg(hdev->dev, "%s firmware size == %zu\n", fw_name, fw_size);
47
48 fw_data = (const u64 *) fw->data;
49
50 memcpy_toio(dst, fw_data, fw_size);
51
52 out:
53 release_firmware(fw);
54 return rc;
55 }
56
57 int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
58 {
59 struct armcp_packet pkt = {};
60
61 pkt.ctl = cpu_to_le32(opcode << ARMCP_PKT_CTL_OPCODE_SHIFT);
62
63 return hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt,
64 sizeof(pkt), HL_DEVICE_TIMEOUT_USEC, NULL);
65 }
66
67 int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
68 u16 len, u32 timeout, long *result)
69 {
70 struct armcp_packet *pkt;
71 dma_addr_t pkt_dma_addr;
72 u32 tmp;
73 int rc = 0;
74
75 pkt = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, len,
76 &pkt_dma_addr);
77 if (!pkt) {
78 dev_err(hdev->dev,
79 "Failed to allocate DMA memory for packet to CPU\n");
80 return -ENOMEM;
81 }
82
83 memcpy(pkt, msg, len);
84
85 mutex_lock(&hdev->send_cpu_message_lock);
86
87 if (hdev->disabled)
88 goto out;
89
90 if (hdev->device_cpu_disabled) {
91 rc = -EIO;
92 goto out;
93 }
94
95 rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, len, pkt_dma_addr);
96 if (rc) {
97 dev_err(hdev->dev, "Failed to send CB on CPU PQ (%d)\n", rc);
98 goto out;
99 }
100
101 rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp,
102 (tmp == ARMCP_PACKET_FENCE_VAL), 1000,
103 timeout, true);
104
105 hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id);
106
107 if (rc == -ETIMEDOUT) {
108 dev_err(hdev->dev, "Device CPU packet timeout (0x%x)\n", tmp);
109 hdev->device_cpu_disabled = true;
110 goto out;
111 }
112
113 tmp = le32_to_cpu(pkt->ctl);
114
115 rc = (tmp & ARMCP_PKT_CTL_RC_MASK) >> ARMCP_PKT_CTL_RC_SHIFT;
116 if (rc) {
117 dev_err(hdev->dev, "F/W ERROR %d for CPU packet %d\n",
118 rc,
119 (tmp & ARMCP_PKT_CTL_OPCODE_MASK)
120 >> ARMCP_PKT_CTL_OPCODE_SHIFT);
121 rc = -EIO;
122 } else if (result) {
123 *result = (long) le64_to_cpu(pkt->result);
124 }
125
126 out:
127 mutex_unlock(&hdev->send_cpu_message_lock);
128
129 hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, len, pkt);
130
131 return rc;
132 }
133
134 int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type)
135 {
136 struct armcp_packet pkt;
137 long result;
138 int rc;
139
140 memset(&pkt, 0, sizeof(pkt));
141
142 pkt.ctl = cpu_to_le32(ARMCP_PACKET_UNMASK_RAZWI_IRQ <<
143 ARMCP_PKT_CTL_OPCODE_SHIFT);
144 pkt.value = cpu_to_le64(event_type);
145
146 rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
147 HL_DEVICE_TIMEOUT_USEC, &result);
148
149 if (rc)
150 dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type);
151
152 return rc;
153 }
154
155 int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
156 size_t irq_arr_size)
157 {
158 struct armcp_unmask_irq_arr_packet *pkt;
159 size_t total_pkt_size;
160 long result;
161 int rc;
162
163 total_pkt_size = sizeof(struct armcp_unmask_irq_arr_packet) +
164 irq_arr_size;
165
166 /* data should be aligned to 8 bytes in order to ArmCP to copy it */
167 total_pkt_size = (total_pkt_size + 0x7) & ~0x7;
168
169 /* total_pkt_size is casted to u16 later on */
170 if (total_pkt_size > USHRT_MAX) {
171 dev_err(hdev->dev, "too many elements in IRQ array\n");
172 return -EINVAL;
173 }
174
175 pkt = kzalloc(total_pkt_size, GFP_KERNEL);
176 if (!pkt)
177 return -ENOMEM;
178
179 pkt->length = cpu_to_le32(irq_arr_size / sizeof(irq_arr[0]));
180 memcpy(&pkt->irqs, irq_arr, irq_arr_size);
181
182 pkt->armcp_pkt.ctl = cpu_to_le32(ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY <<
183 ARMCP_PKT_CTL_OPCODE_SHIFT);
184
185 rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) pkt,
186 total_pkt_size, HL_DEVICE_TIMEOUT_USEC, &result);
187
188 if (rc)
189 dev_err(hdev->dev, "failed to unmask IRQ array\n");
190
191 kfree(pkt);
192
193 return rc;
194 }
195
196 int hl_fw_test_cpu_queue(struct hl_device *hdev)
197 {
198 struct armcp_packet test_pkt = {};
199 long result;
200 int rc;
201
202 test_pkt.ctl = cpu_to_le32(ARMCP_PACKET_TEST <<
203 ARMCP_PKT_CTL_OPCODE_SHIFT);
204 test_pkt.value = cpu_to_le64(ARMCP_PACKET_FENCE_VAL);
205
206 rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &test_pkt,
207 sizeof(test_pkt), HL_DEVICE_TIMEOUT_USEC, &result);
208
209 if (!rc) {
210 if (result != ARMCP_PACKET_FENCE_VAL)
211 dev_err(hdev->dev,
212 "CPU queue test failed (0x%08lX)\n", result);
213 } else {
214 dev_err(hdev->dev, "CPU queue test failed, error %d\n", rc);
215 }
216
217 return rc;
218 }
219
220 void *hl_fw_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size,
221 dma_addr_t *dma_handle)
222 {
223 u64 kernel_addr;
224
225 kernel_addr = gen_pool_alloc(hdev->cpu_accessible_dma_pool, size);
226
227 *dma_handle = hdev->cpu_accessible_dma_address +
228 (kernel_addr - (u64) (uintptr_t) hdev->cpu_accessible_dma_mem);
229
230 return (void *) (uintptr_t) kernel_addr;
231 }
232
233 void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
234 void *vaddr)
235 {
236 gen_pool_free(hdev->cpu_accessible_dma_pool, (u64) (uintptr_t) vaddr,
237 size);
238 }
239
240 int hl_fw_send_heartbeat(struct hl_device *hdev)
241 {
242 struct armcp_packet hb_pkt = {};
243 long result;
244 int rc;
245
246 hb_pkt.ctl = cpu_to_le32(ARMCP_PACKET_TEST <<
247 ARMCP_PKT_CTL_OPCODE_SHIFT);
248 hb_pkt.value = cpu_to_le64(ARMCP_PACKET_FENCE_VAL);
249
250 rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt,
251 sizeof(hb_pkt), HL_DEVICE_TIMEOUT_USEC, &result);
252
253 if ((rc) || (result != ARMCP_PACKET_FENCE_VAL))
254 rc = -EIO;
255
256 return rc;
257 }
258
259 int hl_fw_armcp_info_get(struct hl_device *hdev)
260 {
261 struct asic_fixed_properties *prop = &hdev->asic_prop;
262 struct armcp_packet pkt = {};
263 void *armcp_info_cpu_addr;
264 dma_addr_t armcp_info_dma_addr;
265 long result;
266 int rc;
267
268 armcp_info_cpu_addr =
269 hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev,
270 sizeof(struct armcp_info),
271 &armcp_info_dma_addr);
272 if (!armcp_info_cpu_addr) {
273 dev_err(hdev->dev,
274 "Failed to allocate DMA memory for ArmCP info packet\n");
275 return -ENOMEM;
276 }
277
278 memset(armcp_info_cpu_addr, 0, sizeof(struct armcp_info));
279
280 pkt.ctl = cpu_to_le32(ARMCP_PACKET_INFO_GET <<
281 ARMCP_PKT_CTL_OPCODE_SHIFT);
282 pkt.addr = cpu_to_le64(armcp_info_dma_addr);
283 pkt.data_max_size = cpu_to_le32(sizeof(struct armcp_info));
284
285 rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
286 HL_ARMCP_INFO_TIMEOUT_USEC, &result);
287 if (rc) {
288 dev_err(hdev->dev,
289 "Failed to send ArmCP info pkt, error %d\n", rc);
290 goto out;
291 }
292
293 memcpy(&prop->armcp_info, armcp_info_cpu_addr,
294 sizeof(prop->armcp_info));
295
296 rc = hl_build_hwmon_channel_info(hdev, prop->armcp_info.sensors);
297 if (rc) {
298 dev_err(hdev->dev,
299 "Failed to build hwmon channel info, error %d\n", rc);
300 rc = -EFAULT;
301 goto out;
302 }
303
304 out:
305 hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev,
306 sizeof(struct armcp_info), armcp_info_cpu_addr);
307
308 return rc;
309 }
310
311 int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
312 {
313 struct armcp_packet pkt = {};
314 void *eeprom_info_cpu_addr;
315 dma_addr_t eeprom_info_dma_addr;
316 long result;
317 int rc;
318
319 eeprom_info_cpu_addr =
320 hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev,
321 max_size, &eeprom_info_dma_addr);
322 if (!eeprom_info_cpu_addr) {
323 dev_err(hdev->dev,
324 "Failed to allocate DMA memory for ArmCP EEPROM packet\n");
325 return -ENOMEM;
326 }
327
328 memset(eeprom_info_cpu_addr, 0, max_size);
329
330 pkt.ctl = cpu_to_le32(ARMCP_PACKET_EEPROM_DATA_GET <<
331 ARMCP_PKT_CTL_OPCODE_SHIFT);
332 pkt.addr = cpu_to_le64(eeprom_info_dma_addr);
333 pkt.data_max_size = cpu_to_le32(max_size);
334
335 rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
336 HL_ARMCP_EEPROM_TIMEOUT_USEC, &result);
337
338 if (rc) {
339 dev_err(hdev->dev,
340 "Failed to send ArmCP EEPROM packet, error %d\n", rc);
341 goto out;
342 }
343
344 /* result contains the actual size */
345 memcpy(data, eeprom_info_cpu_addr, min((size_t)result, max_size));
346
347 out:
348 hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, max_size,
349 eeprom_info_cpu_addr);
350
351 return rc;
352 }
353
354 static void fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg)
355 {
356 u32 err_val;
357
358 /* Some of the firmware status codes are deprecated in newer f/w
359 * versions. In those versions, the errors are reported
360 * in different registers. Therefore, we need to check those
361 * registers and print the exact errors. Moreover, there
362 * may be multiple errors, so we need to report on each error
363 * separately. Some of the error codes might indicate a state
364 * that is not an error per-se, but it is an error in production
365 * environment
366 */
367 err_val = RREG32(boot_err0_reg);
368 if (!(err_val & CPU_BOOT_ERR0_ENABLED))
369 return;
370
371 if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL)
372 dev_err(hdev->dev,
373 "Device boot error - DRAM initialization failed\n");
374 if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED)
375 dev_err(hdev->dev, "Device boot error - FIT image corrupted\n");
376 if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL)
377 dev_err(hdev->dev,
378 "Device boot error - Thermal Sensor initialization failed\n");
379 if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED)
380 dev_warn(hdev->dev,
381 "Device boot warning - Skipped DRAM initialization\n");
382 if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED)
383 dev_warn(hdev->dev,
384 "Device boot error - Skipped waiting for BMC\n");
385 if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY)
386 dev_err(hdev->dev,
387 "Device boot error - Serdes data from BMC not available\n");
388 if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL)
389 dev_err(hdev->dev,
390 "Device boot error - NIC F/W initialization failed\n");
391 }
392
393 int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
394 u32 msg_to_cpu_reg, u32 cpu_msg_status_reg,
395 u32 boot_err0_reg, bool skip_bmc,
396 u32 cpu_timeout, u32 boot_fit_timeout)
397 {
398 u32 status;
399 int rc;
400
401 dev_info(hdev->dev, "Going to wait for device boot (up to %lds)\n",
402 cpu_timeout / USEC_PER_SEC);
403
404 /* Wait for boot FIT request */
405 rc = hl_poll_timeout(
406 hdev,
407 cpu_boot_status_reg,
408 status,
409 status == CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT,
410 10000,
411 boot_fit_timeout);
412
413 if (rc) {
414 dev_dbg(hdev->dev,
415 "No boot fit request received, resuming boot\n");
416 } else {
417 rc = hdev->asic_funcs->load_boot_fit_to_device(hdev);
418 if (rc)
419 goto out;
420
421 /* Clear device CPU message status */
422 WREG32(cpu_msg_status_reg, CPU_MSG_CLR);
423
424 /* Signal device CPU that boot loader is ready */
425 WREG32(msg_to_cpu_reg, KMD_MSG_FIT_RDY);
426
427 /* Poll for CPU device ack */
428 rc = hl_poll_timeout(
429 hdev,
430 cpu_msg_status_reg,
431 status,
432 status == CPU_MSG_OK,
433 10000,
434 boot_fit_timeout);
435
436 if (rc) {
437 dev_err(hdev->dev,
438 "Timeout waiting for boot fit load ack\n");
439 goto out;
440 }
441
442 /* Clear message */
443 WREG32(msg_to_cpu_reg, KMD_MSG_NA);
444 }
445
446 /* Make sure CPU boot-loader is running */
447 rc = hl_poll_timeout(
448 hdev,
449 cpu_boot_status_reg,
450 status,
451 (status == CPU_BOOT_STATUS_DRAM_RDY) ||
452 (status == CPU_BOOT_STATUS_NIC_FW_RDY) ||
453 (status == CPU_BOOT_STATUS_READY_TO_BOOT) ||
454 (status == CPU_BOOT_STATUS_SRAM_AVAIL),
455 10000,
456 cpu_timeout);
457
458 /* Read U-Boot, preboot versions now in case we will later fail */
459 hdev->asic_funcs->read_device_fw_version(hdev, FW_COMP_UBOOT);
460 hdev->asic_funcs->read_device_fw_version(hdev, FW_COMP_PREBOOT);
461
462 /* Some of the status codes below are deprecated in newer f/w
463 * versions but we keep them here for backward compatibility
464 */
465 if (rc) {
466 switch (status) {
467 case CPU_BOOT_STATUS_NA:
468 dev_err(hdev->dev,
469 "Device boot error - BTL did NOT run\n");
470 break;
471 case CPU_BOOT_STATUS_IN_WFE:
472 dev_err(hdev->dev,
473 "Device boot error - Stuck inside WFE loop\n");
474 break;
475 case CPU_BOOT_STATUS_IN_BTL:
476 dev_err(hdev->dev,
477 "Device boot error - Stuck in BTL\n");
478 break;
479 case CPU_BOOT_STATUS_IN_PREBOOT:
480 dev_err(hdev->dev,
481 "Device boot error - Stuck in Preboot\n");
482 break;
483 case CPU_BOOT_STATUS_IN_SPL:
484 dev_err(hdev->dev,
485 "Device boot error - Stuck in SPL\n");
486 break;
487 case CPU_BOOT_STATUS_IN_UBOOT:
488 dev_err(hdev->dev,
489 "Device boot error - Stuck in u-boot\n");
490 break;
491 case CPU_BOOT_STATUS_DRAM_INIT_FAIL:
492 dev_err(hdev->dev,
493 "Device boot error - DRAM initialization failed\n");
494 break;
495 case CPU_BOOT_STATUS_UBOOT_NOT_READY:
496 dev_err(hdev->dev,
497 "Device boot error - u-boot stopped by user\n");
498 break;
499 case CPU_BOOT_STATUS_TS_INIT_FAIL:
500 dev_err(hdev->dev,
501 "Device boot error - Thermal Sensor initialization failed\n");
502 break;
503 default:
504 dev_err(hdev->dev,
505 "Device boot error - Invalid status code %d\n",
506 status);
507 break;
508 }
509
510 rc = -EIO;
511 goto out;
512 }
513
514 if (!hdev->fw_loading) {
515 dev_info(hdev->dev, "Skip loading FW\n");
516 goto out;
517 }
518
519 if (status == CPU_BOOT_STATUS_SRAM_AVAIL)
520 goto out;
521
522 dev_info(hdev->dev,
523 "Loading firmware to device, may take some time...\n");
524
525 rc = hdev->asic_funcs->load_firmware_to_device(hdev);
526 if (rc)
527 goto out;
528
529 if (skip_bmc) {
530 WREG32(msg_to_cpu_reg, KMD_MSG_SKIP_BMC);
531
532 rc = hl_poll_timeout(
533 hdev,
534 cpu_boot_status_reg,
535 status,
536 (status == CPU_BOOT_STATUS_BMC_WAITING_SKIPPED),
537 10000,
538 cpu_timeout);
539
540 if (rc) {
541 dev_err(hdev->dev,
542 "Failed to get ACK on skipping BMC, %d\n",
543 status);
544 WREG32(msg_to_cpu_reg, KMD_MSG_NA);
545 rc = -EIO;
546 goto out;
547 }
548 }
549
550 WREG32(msg_to_cpu_reg, KMD_MSG_FIT_RDY);
551
552 rc = hl_poll_timeout(
553 hdev,
554 cpu_boot_status_reg,
555 status,
556 (status == CPU_BOOT_STATUS_SRAM_AVAIL),
557 10000,
558 cpu_timeout);
559
560 /* Clear message */
561 WREG32(msg_to_cpu_reg, KMD_MSG_NA);
562
563 if (rc) {
564 if (status == CPU_BOOT_STATUS_FIT_CORRUPTED)
565 dev_err(hdev->dev,
566 "Device reports FIT image is corrupted\n");
567 else
568 dev_err(hdev->dev,
569 "Device failed to load, %d\n", status);
570
571 rc = -EIO;
572 goto out;
573 }
574
575 dev_info(hdev->dev, "Successfully loaded firmware to device\n");
576
577 out:
578 fw_read_errors(hdev, boot_err0_reg);
579
580 return rc;
581 }