drm/amdkfd: Add GPU reset SMI event
Add support for reporting GPU reset events through SMI. KFD reports both pre-reset and post-reset events.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:

committed by
Alex Deucher

parent
e230ac1118
commit
55977744f9
@@ -812,6 +812,8 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
|
|||||||
if (!kfd->init_complete)
|
if (!kfd->init_complete)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
kfd_smi_event_update_gpu_reset(kfd, false);
|
||||||
|
|
||||||
kfd->dqm->ops.pre_reset(kfd->dqm);
|
kfd->dqm->ops.pre_reset(kfd->dqm);
|
||||||
|
|
||||||
kgd2kfd_suspend(kfd, false);
|
kgd2kfd_suspend(kfd, false);
|
||||||
@@ -840,6 +842,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
|
|||||||
|
|
||||||
atomic_set(&kfd->sram_ecc_flag, 0);
|
atomic_set(&kfd->sram_ecc_flag, 0);
|
||||||
|
|
||||||
|
kfd_smi_event_update_gpu_reset(kfd, true);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -312,6 +312,8 @@ struct kfd_dev {
|
|||||||
/* Clients watching SMI events */
|
/* Clients watching SMI events */
|
||||||
struct list_head smi_clients;
|
struct list_head smi_clients;
|
||||||
spinlock_t smi_lock;
|
spinlock_t smi_lock;
|
||||||
|
|
||||||
|
uint32_t reset_seq_num;
|
||||||
};
|
};
|
||||||
|
|
||||||
enum kfd_mempool {
|
enum kfd_mempool {
|
||||||
|
@@ -174,6 +174,36 @@ static void add_event_to_kfifo(struct kfd_dev *dev, unsigned int smi_event,
|
|||||||
rcu_read_unlock();
|
rcu_read_unlock();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* GpuReset msg = Reset seq number (incremented for
|
||||||
|
* every reset message sent before GPU reset).
|
||||||
|
* 1 byte event + 1 byte space + 8 bytes seq num +
|
||||||
|
* 1 byte \n + 1 byte \0 = 12
|
||||||
|
*/
|
||||||
|
char fifo_in[12];
|
||||||
|
int len;
|
||||||
|
unsigned int event;
|
||||||
|
|
||||||
|
if (list_empty(&dev->smi_clients))
|
||||||
|
return;
|
||||||
|
|
||||||
|
memset(fifo_in, 0x0, sizeof(fifo_in));
|
||||||
|
|
||||||
|
if (post_reset) {
|
||||||
|
event = KFD_SMI_EVENT_GPU_POST_RESET;
|
||||||
|
} else {
|
||||||
|
event = KFD_SMI_EVENT_GPU_PRE_RESET;
|
||||||
|
++(dev->reset_seq_num);
|
||||||
|
}
|
||||||
|
|
||||||
|
len = snprintf(fifo_in, sizeof(fifo_in), "%x %x\n", event,
|
||||||
|
dev->reset_seq_num);
|
||||||
|
|
||||||
|
add_event_to_kfifo(dev, event, fifo_in, len);
|
||||||
|
}
|
||||||
|
|
||||||
void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
|
void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
|
||||||
uint32_t throttle_bitmask)
|
uint32_t throttle_bitmask)
|
||||||
{
|
{
|
||||||
@@ -191,7 +221,7 @@ void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
|
|||||||
if (list_empty(&dev->smi_clients))
|
if (list_empty(&dev->smi_clients))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
len = snprintf(fifo_in, 29, "%x %x:%llx\n",
|
len = snprintf(fifo_in, sizeof(fifo_in), "%x %x:%llx\n",
|
||||||
KFD_SMI_EVENT_THERMAL_THROTTLE, throttle_bitmask,
|
KFD_SMI_EVENT_THERMAL_THROTTLE, throttle_bitmask,
|
||||||
atomic64_read(&adev->smu.throttle_int_counter));
|
atomic64_read(&adev->smu.throttle_int_counter));
|
||||||
|
|
||||||
@@ -218,7 +248,7 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
|
|||||||
if (!task_info.pid)
|
if (!task_info.pid)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
len = snprintf(fifo_in, 29, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
|
len = snprintf(fifo_in, sizeof(fifo_in), "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
|
||||||
task_info.pid, task_info.task_name);
|
task_info.pid, task_info.task_name);
|
||||||
|
|
||||||
add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len);
|
add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len);
|
||||||
|
@@ -27,5 +27,6 @@ int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd);
|
|||||||
void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
|
void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
|
||||||
void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
|
void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
|
||||||
uint32_t throttle_bitmask);
|
uint32_t throttle_bitmask);
|
||||||
|
void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@@ -453,6 +453,8 @@ enum kfd_smi_event {
|
|||||||
KFD_SMI_EVENT_NONE = 0, /* not used */
|
KFD_SMI_EVENT_NONE = 0, /* not used */
|
||||||
KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */
|
KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */
|
||||||
KFD_SMI_EVENT_THERMAL_THROTTLE = 2,
|
KFD_SMI_EVENT_THERMAL_THROTTLE = 2,
|
||||||
|
KFD_SMI_EVENT_GPU_PRE_RESET = 3,
|
||||||
|
KFD_SMI_EVENT_GPU_POST_RESET = 4,
|
||||||
};
|
};
|
||||||
|
|
||||||
#define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
|
#define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
|
||||||
|
Reference in New Issue
Block a user