1 // SPDX-License-Identifier: GPL-2.0
4 * Copyright 2016-2019 HabanaLabs, Ltd.
8 #include "habanalabs.h"
9 #include "include/hl_boot_if.h"
11 #include <linux/firmware.h>
12 #include <linux/genalloc.h>
13 #include <linux/io-64-nonatomic-lo-hi.h>
14 #include <linux/slab.h>
17 * hl_fw_load_fw_to_device() - Load F/W code to device's memory.
18 * @hdev: pointer to hl_device structure.
20 * Copy fw code from firmware file to device memory.
22 * Return: 0 on success, non-zero for failure.
24 int hl_fw_load_fw_to_device(struct hl_device
*hdev
, const char *fw_name
,
27 const struct firmware
*fw
;
32 rc
= request_firmware(&fw
, fw_name
, hdev
->dev
);
34 dev_err(hdev
->dev
, "Firmware file %s is not found!\n", fw_name
);
39 if ((fw_size
% 4) != 0) {
40 dev_err(hdev
->dev
, "Illegal %s firmware size %zu\n",
46 dev_dbg(hdev
->dev
, "%s firmware size == %zu\n", fw_name
, fw_size
);
48 fw_data
= (const u64
*) fw
->data
;
50 memcpy_toio(dst
, fw_data
, fw_size
);
57 int hl_fw_send_pci_access_msg(struct hl_device
*hdev
, u32 opcode
)
59 struct armcp_packet pkt
= {};
61 pkt
.ctl
= cpu_to_le32(opcode
<< ARMCP_PKT_CTL_OPCODE_SHIFT
);
63 return hdev
->asic_funcs
->send_cpu_message(hdev
, (u32
*) &pkt
,
64 sizeof(pkt
), HL_DEVICE_TIMEOUT_USEC
, NULL
);
67 int hl_fw_send_cpu_message(struct hl_device
*hdev
, u32 hw_queue_id
, u32
*msg
,
68 u16 len
, u32 timeout
, long *result
)
70 struct armcp_packet
*pkt
;
71 dma_addr_t pkt_dma_addr
;
75 pkt
= hdev
->asic_funcs
->cpu_accessible_dma_pool_alloc(hdev
, len
,
79 "Failed to allocate DMA memory for packet to CPU\n");
83 memcpy(pkt
, msg
, len
);
85 mutex_lock(&hdev
->send_cpu_message_lock
);
90 if (hdev
->device_cpu_disabled
) {
95 rc
= hl_hw_queue_send_cb_no_cmpl(hdev
, hw_queue_id
, len
, pkt_dma_addr
);
97 dev_err(hdev
->dev
, "Failed to send CB on CPU PQ (%d)\n", rc
);
101 rc
= hl_poll_timeout_memory(hdev
, &pkt
->fence
, tmp
,
102 (tmp
== ARMCP_PACKET_FENCE_VAL
), 1000,
105 hl_hw_queue_inc_ci_kernel(hdev
, hw_queue_id
);
107 if (rc
== -ETIMEDOUT
) {
108 dev_err(hdev
->dev
, "Device CPU packet timeout (0x%x)\n", tmp
);
109 hdev
->device_cpu_disabled
= true;
113 tmp
= le32_to_cpu(pkt
->ctl
);
115 rc
= (tmp
& ARMCP_PKT_CTL_RC_MASK
) >> ARMCP_PKT_CTL_RC_SHIFT
;
117 dev_err(hdev
->dev
, "F/W ERROR %d for CPU packet %d\n",
119 (tmp
& ARMCP_PKT_CTL_OPCODE_MASK
)
120 >> ARMCP_PKT_CTL_OPCODE_SHIFT
);
123 *result
= (long) le64_to_cpu(pkt
->result
);
127 mutex_unlock(&hdev
->send_cpu_message_lock
);
129 hdev
->asic_funcs
->cpu_accessible_dma_pool_free(hdev
, len
, pkt
);
134 int hl_fw_unmask_irq(struct hl_device
*hdev
, u16 event_type
)
136 struct armcp_packet pkt
;
140 memset(&pkt
, 0, sizeof(pkt
));
142 pkt
.ctl
= cpu_to_le32(ARMCP_PACKET_UNMASK_RAZWI_IRQ
<<
143 ARMCP_PKT_CTL_OPCODE_SHIFT
);
144 pkt
.value
= cpu_to_le64(event_type
);
146 rc
= hdev
->asic_funcs
->send_cpu_message(hdev
, (u32
*) &pkt
, sizeof(pkt
),
147 HL_DEVICE_TIMEOUT_USEC
, &result
);
150 dev_err(hdev
->dev
, "failed to unmask RAZWI IRQ %d", event_type
);
155 int hl_fw_unmask_irq_arr(struct hl_device
*hdev
, const u32
*irq_arr
,
158 struct armcp_unmask_irq_arr_packet
*pkt
;
159 size_t total_pkt_size
;
163 total_pkt_size
= sizeof(struct armcp_unmask_irq_arr_packet
) +
166 /* data should be aligned to 8 bytes in order for ArmCP to copy it */
167 total_pkt_size
= (total_pkt_size
+ 0x7) & ~0x7;
169 /* total_pkt_size is cast to u16 later on */
170 if (total_pkt_size
> USHRT_MAX
) {
171 dev_err(hdev
->dev
, "too many elements in IRQ array\n");
175 pkt
= kzalloc(total_pkt_size
, GFP_KERNEL
);
179 pkt
->length
= cpu_to_le32(irq_arr_size
/ sizeof(irq_arr
[0]));
180 memcpy(&pkt
->irqs
, irq_arr
, irq_arr_size
);
182 pkt
->armcp_pkt
.ctl
= cpu_to_le32(ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY
<<
183 ARMCP_PKT_CTL_OPCODE_SHIFT
);
185 rc
= hdev
->asic_funcs
->send_cpu_message(hdev
, (u32
*) pkt
,
186 total_pkt_size
, HL_DEVICE_TIMEOUT_USEC
, &result
);
189 dev_err(hdev
->dev
, "failed to unmask IRQ array\n");
196 int hl_fw_test_cpu_queue(struct hl_device
*hdev
)
198 struct armcp_packet test_pkt
= {};
202 test_pkt
.ctl
= cpu_to_le32(ARMCP_PACKET_TEST
<<
203 ARMCP_PKT_CTL_OPCODE_SHIFT
);
204 test_pkt
.value
= cpu_to_le64(ARMCP_PACKET_FENCE_VAL
);
206 rc
= hdev
->asic_funcs
->send_cpu_message(hdev
, (u32
*) &test_pkt
,
207 sizeof(test_pkt
), HL_DEVICE_TIMEOUT_USEC
, &result
);
210 if (result
!= ARMCP_PACKET_FENCE_VAL
)
212 "CPU queue test failed (0x%08lX)\n", result
);
214 dev_err(hdev
->dev
, "CPU queue test failed, error %d\n", rc
);
220 void *hl_fw_cpu_accessible_dma_pool_alloc(struct hl_device
*hdev
, size_t size
,
221 dma_addr_t
*dma_handle
)
225 kernel_addr
= gen_pool_alloc(hdev
->cpu_accessible_dma_pool
, size
);
227 *dma_handle
= hdev
->cpu_accessible_dma_address
+
228 (kernel_addr
- (u64
) (uintptr_t) hdev
->cpu_accessible_dma_mem
);
230 return (void *) (uintptr_t) kernel_addr
;
233 void hl_fw_cpu_accessible_dma_pool_free(struct hl_device
*hdev
, size_t size
,
236 gen_pool_free(hdev
->cpu_accessible_dma_pool
, (u64
) (uintptr_t) vaddr
,
240 int hl_fw_send_heartbeat(struct hl_device
*hdev
)
242 struct armcp_packet hb_pkt
= {};
246 hb_pkt
.ctl
= cpu_to_le32(ARMCP_PACKET_TEST
<<
247 ARMCP_PKT_CTL_OPCODE_SHIFT
);
248 hb_pkt
.value
= cpu_to_le64(ARMCP_PACKET_FENCE_VAL
);
250 rc
= hdev
->asic_funcs
->send_cpu_message(hdev
, (u32
*) &hb_pkt
,
251 sizeof(hb_pkt
), HL_DEVICE_TIMEOUT_USEC
, &result
);
253 if ((rc
) || (result
!= ARMCP_PACKET_FENCE_VAL
))
259 int hl_fw_armcp_info_get(struct hl_device
*hdev
)
261 struct asic_fixed_properties
*prop
= &hdev
->asic_prop
;
262 struct armcp_packet pkt
= {};
263 void *armcp_info_cpu_addr
;
264 dma_addr_t armcp_info_dma_addr
;
268 armcp_info_cpu_addr
=
269 hdev
->asic_funcs
->cpu_accessible_dma_pool_alloc(hdev
,
270 sizeof(struct armcp_info
),
271 &armcp_info_dma_addr
);
272 if (!armcp_info_cpu_addr
) {
274 "Failed to allocate DMA memory for ArmCP info packet\n");
278 memset(armcp_info_cpu_addr
, 0, sizeof(struct armcp_info
));
280 pkt
.ctl
= cpu_to_le32(ARMCP_PACKET_INFO_GET
<<
281 ARMCP_PKT_CTL_OPCODE_SHIFT
);
282 pkt
.addr
= cpu_to_le64(armcp_info_dma_addr
);
283 pkt
.data_max_size
= cpu_to_le32(sizeof(struct armcp_info
));
285 rc
= hdev
->asic_funcs
->send_cpu_message(hdev
, (u32
*) &pkt
, sizeof(pkt
),
286 HL_ARMCP_INFO_TIMEOUT_USEC
, &result
);
289 "Failed to send ArmCP info pkt, error %d\n", rc
);
293 memcpy(&prop
->armcp_info
, armcp_info_cpu_addr
,
294 sizeof(prop
->armcp_info
));
296 rc
= hl_build_hwmon_channel_info(hdev
, prop
->armcp_info
.sensors
);
299 "Failed to build hwmon channel info, error %d\n", rc
);
305 hdev
->asic_funcs
->cpu_accessible_dma_pool_free(hdev
,
306 sizeof(struct armcp_info
), armcp_info_cpu_addr
);
311 int hl_fw_get_eeprom_data(struct hl_device
*hdev
, void *data
, size_t max_size
)
313 struct armcp_packet pkt
= {};
314 void *eeprom_info_cpu_addr
;
315 dma_addr_t eeprom_info_dma_addr
;
319 eeprom_info_cpu_addr
=
320 hdev
->asic_funcs
->cpu_accessible_dma_pool_alloc(hdev
,
321 max_size
, &eeprom_info_dma_addr
);
322 if (!eeprom_info_cpu_addr
) {
324 "Failed to allocate DMA memory for ArmCP EEPROM packet\n");
328 memset(eeprom_info_cpu_addr
, 0, max_size
);
330 pkt
.ctl
= cpu_to_le32(ARMCP_PACKET_EEPROM_DATA_GET
<<
331 ARMCP_PKT_CTL_OPCODE_SHIFT
);
332 pkt
.addr
= cpu_to_le64(eeprom_info_dma_addr
);
333 pkt
.data_max_size
= cpu_to_le32(max_size
);
335 rc
= hdev
->asic_funcs
->send_cpu_message(hdev
, (u32
*) &pkt
, sizeof(pkt
),
336 HL_ARMCP_EEPROM_TIMEOUT_USEC
, &result
);
340 "Failed to send ArmCP EEPROM packet, error %d\n", rc
);
344 /* result contains the actual size */
345 memcpy(data
, eeprom_info_cpu_addr
, min((size_t)result
, max_size
));
348 hdev
->asic_funcs
->cpu_accessible_dma_pool_free(hdev
, max_size
,
349 eeprom_info_cpu_addr
);
354 static void fw_read_errors(struct hl_device
*hdev
, u32 boot_err0_reg
)
358 /* Some of the firmware status codes are deprecated in newer f/w
359 * versions. In those versions, the errors are reported
360 * in different registers. Therefore, we need to check those
361 * registers and print the exact errors. Moreover, there
362 * may be multiple errors, so we need to report on each error
363 * separately. Some of the error codes might indicate a state
364 * that is not an error per-se, but it is an error in production
367 err_val
= RREG32(boot_err0_reg
);
368 if (!(err_val
& CPU_BOOT_ERR0_ENABLED
))
371 if (err_val
& CPU_BOOT_ERR0_DRAM_INIT_FAIL
)
373 "Device boot error - DRAM initialization failed\n");
374 if (err_val
& CPU_BOOT_ERR0_FIT_CORRUPTED
)
375 dev_err(hdev
->dev
, "Device boot error - FIT image corrupted\n");
376 if (err_val
& CPU_BOOT_ERR0_TS_INIT_FAIL
)
378 "Device boot error - Thermal Sensor initialization failed\n");
379 if (err_val
& CPU_BOOT_ERR0_DRAM_SKIPPED
)
381 "Device boot warning - Skipped DRAM initialization\n");
382 if (err_val
& CPU_BOOT_ERR0_BMC_WAIT_SKIPPED
)
384 "Device boot error - Skipped waiting for BMC\n");
385 if (err_val
& CPU_BOOT_ERR0_NIC_DATA_NOT_RDY
)
387 "Device boot error - Serdes data from BMC not available\n");
388 if (err_val
& CPU_BOOT_ERR0_NIC_FW_FAIL
)
390 "Device boot error - NIC F/W initialization failed\n");
393 int hl_fw_init_cpu(struct hl_device
*hdev
, u32 cpu_boot_status_reg
,
394 u32 msg_to_cpu_reg
, u32 cpu_msg_status_reg
,
395 u32 boot_err0_reg
, bool skip_bmc
,
396 u32 cpu_timeout
, u32 boot_fit_timeout
)
401 dev_info(hdev
->dev
, "Going to wait for device boot (up to %lds)\n",
402 cpu_timeout
/ USEC_PER_SEC
);
404 /* Wait for boot FIT request */
405 rc
= hl_poll_timeout(
409 status
== CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT
,
415 "No boot fit request received, resuming boot\n");
417 rc
= hdev
->asic_funcs
->load_boot_fit_to_device(hdev
);
421 /* Clear device CPU message status */
422 WREG32(cpu_msg_status_reg
, CPU_MSG_CLR
);
424 /* Signal device CPU that boot loader is ready */
425 WREG32(msg_to_cpu_reg
, KMD_MSG_FIT_RDY
);
427 /* Poll for CPU device ack */
428 rc
= hl_poll_timeout(
432 status
== CPU_MSG_OK
,
438 "Timeout waiting for boot fit load ack\n");
443 WREG32(msg_to_cpu_reg
, KMD_MSG_NA
);
446 /* Make sure CPU boot-loader is running */
447 rc
= hl_poll_timeout(
451 (status
== CPU_BOOT_STATUS_DRAM_RDY
) ||
452 (status
== CPU_BOOT_STATUS_NIC_FW_RDY
) ||
453 (status
== CPU_BOOT_STATUS_READY_TO_BOOT
) ||
454 (status
== CPU_BOOT_STATUS_SRAM_AVAIL
),
458 /* Read U-Boot, preboot versions now in case we fail later */
459 hdev
->asic_funcs
->read_device_fw_version(hdev
, FW_COMP_UBOOT
);
460 hdev
->asic_funcs
->read_device_fw_version(hdev
, FW_COMP_PREBOOT
);
462 /* Some of the status codes below are deprecated in newer f/w
463 * versions but we keep them here for backward compatibility
467 case CPU_BOOT_STATUS_NA
:
469 "Device boot error - BTL did NOT run\n");
471 case CPU_BOOT_STATUS_IN_WFE
:
473 "Device boot error - Stuck inside WFE loop\n");
475 case CPU_BOOT_STATUS_IN_BTL
:
477 "Device boot error - Stuck in BTL\n");
479 case CPU_BOOT_STATUS_IN_PREBOOT
:
481 "Device boot error - Stuck in Preboot\n");
483 case CPU_BOOT_STATUS_IN_SPL
:
485 "Device boot error - Stuck in SPL\n");
487 case CPU_BOOT_STATUS_IN_UBOOT
:
489 "Device boot error - Stuck in u-boot\n");
491 case CPU_BOOT_STATUS_DRAM_INIT_FAIL
:
493 "Device boot error - DRAM initialization failed\n");
495 case CPU_BOOT_STATUS_UBOOT_NOT_READY
:
497 "Device boot error - u-boot stopped by user\n");
499 case CPU_BOOT_STATUS_TS_INIT_FAIL
:
501 "Device boot error - Thermal Sensor initialization failed\n");
505 "Device boot error - Invalid status code %d\n",
514 if (!hdev
->fw_loading
) {
515 dev_info(hdev
->dev
, "Skip loading FW\n");
519 if (status
== CPU_BOOT_STATUS_SRAM_AVAIL
)
523 "Loading firmware to device, may take some time...\n");
525 rc
= hdev
->asic_funcs
->load_firmware_to_device(hdev
);
530 WREG32(msg_to_cpu_reg
, KMD_MSG_SKIP_BMC
);
532 rc
= hl_poll_timeout(
536 (status
== CPU_BOOT_STATUS_BMC_WAITING_SKIPPED
),
542 "Failed to get ACK on skipping BMC, %d\n",
544 WREG32(msg_to_cpu_reg
, KMD_MSG_NA
);
550 WREG32(msg_to_cpu_reg
, KMD_MSG_FIT_RDY
);
552 rc
= hl_poll_timeout(
556 (status
== CPU_BOOT_STATUS_SRAM_AVAIL
),
561 WREG32(msg_to_cpu_reg
, KMD_MSG_NA
);
564 if (status
== CPU_BOOT_STATUS_FIT_CORRUPTED
)
566 "Device reports FIT image is corrupted\n");
569 "Device failed to load, %d\n", status
);
575 dev_info(hdev
->dev
, "Successfully loaded firmware to device\n");
578 fw_read_errors(hdev
, boot_err0_reg
);