Merge changes Ide272fd6,Ia2a5c9b0,I373de721,I68cbe5cc,I0fbd6578,I3f0350af,I7123f6b5,Icab1f4e5,Icffe3482 into msm-3.4
* changes: msm: kgsl: In recovery search for command stream after global eop msm: kgsl: Try to replay commands from bad context in recovery msm: kgsl: Expire timestamps after recovery msm: kgsl: Mark the hung context in recovery before extraction msm: kgsl: Write a separate function to set the reset status msm: kgsl: Do not restore per context timestamp states msm: kgsl: Turn on preamble to enable replay of commands msm: kgsl: Separate function to detect last command in recovery msm: kgsl: Create a separate function to extract valid commands
This commit is contained in:
commit
19ffe56a37
|
@ -796,121 +796,59 @@ static int adreno_stop(struct kgsl_device *device)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
adreno_recover_hang(struct kgsl_device *device,
|
||||
struct adreno_recovery_data *rec_data)
|
||||
static void adreno_mark_context_status(struct kgsl_device *device,
|
||||
int recovery_status)
|
||||
{
|
||||
int ret;
|
||||
struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
|
||||
struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
|
||||
unsigned int timestamp;
|
||||
unsigned int reftimestamp;
|
||||
unsigned int enable_ts;
|
||||
unsigned int soptimestamp;
|
||||
unsigned int eoptimestamp;
|
||||
struct kgsl_context *context;
|
||||
struct adreno_context *adreno_context;
|
||||
int next = 0;
|
||||
|
||||
KGSL_DRV_ERR(device,
|
||||
"Starting recovery from 3D GPU hang. Recovery parameters: IB1: 0x%X, "
|
||||
"Bad context_id: %u, global_eop: 0x%x\n", rec_data->ib1,
|
||||
rec_data->context_id, rec_data->global_eop);
|
||||
|
||||
/* Extract valid contents from rb which can stil be executed after
|
||||
* hang */
|
||||
ret = adreno_ringbuffer_extract(rb, rec_data);
|
||||
if (ret)
|
||||
goto done;
|
||||
|
||||
context = idr_find(&device->context_idr, rec_data->context_id);
|
||||
if (context == NULL) {
|
||||
KGSL_DRV_ERR(device, "Last context unknown id:%d\n",
|
||||
rec_data->context_id);
|
||||
rec_data->context_id = KGSL_MEMSTORE_GLOBAL;
|
||||
}
|
||||
|
||||
timestamp = rb->timestamp[KGSL_MEMSTORE_GLOBAL];
|
||||
KGSL_DRV_ERR(device, "Last issued global timestamp: %x\n", timestamp);
|
||||
|
||||
kgsl_sharedmem_readl(&device->memstore, &reftimestamp,
|
||||
KGSL_MEMSTORE_OFFSET(rec_data->context_id,
|
||||
ref_wait_ts));
|
||||
kgsl_sharedmem_readl(&device->memstore, &enable_ts,
|
||||
KGSL_MEMSTORE_OFFSET(rec_data->context_id,
|
||||
ts_cmp_enable));
|
||||
kgsl_sharedmem_readl(&device->memstore, &soptimestamp,
|
||||
KGSL_MEMSTORE_OFFSET(rec_data->context_id,
|
||||
soptimestamp));
|
||||
kgsl_sharedmem_readl(&device->memstore, &eoptimestamp,
|
||||
KGSL_MEMSTORE_OFFSET(rec_data->context_id,
|
||||
eoptimestamp));
|
||||
/* Make sure memory is synchronized before restarting the GPU */
|
||||
mb();
|
||||
KGSL_CTXT_ERR(device,
|
||||
"Context id that caused a GPU hang: %d\n",
|
||||
rec_data->context_id);
|
||||
/* restart device */
|
||||
ret = adreno_stop(device);
|
||||
if (ret)
|
||||
goto done;
|
||||
ret = adreno_start(device, true);
|
||||
if (ret)
|
||||
goto done;
|
||||
KGSL_DRV_ERR(device, "Device has been restarted after hang\n");
|
||||
/* Restore timestamp states */
|
||||
kgsl_sharedmem_writel(&device->memstore,
|
||||
KGSL_MEMSTORE_OFFSET(rec_data->context_id,
|
||||
soptimestamp), soptimestamp);
|
||||
kgsl_sharedmem_writel(&device->memstore,
|
||||
KGSL_MEMSTORE_OFFSET(rec_data->context_id,
|
||||
eoptimestamp), eoptimestamp);
|
||||
|
||||
if (rec_data->rb_size) {
|
||||
kgsl_sharedmem_writel(&device->memstore,
|
||||
KGSL_MEMSTORE_OFFSET(rec_data->context_id,
|
||||
ref_wait_ts), reftimestamp);
|
||||
kgsl_sharedmem_writel(&device->memstore,
|
||||
KGSL_MEMSTORE_OFFSET(rec_data->context_id,
|
||||
ts_cmp_enable), enable_ts);
|
||||
}
|
||||
/* Make sure all writes are posted before the GPU reads them */
|
||||
wmb();
|
||||
/* Mark the invalid context so no more commands are accepted from
|
||||
* that context */
|
||||
|
||||
adreno_context = context->devctxt;
|
||||
|
||||
KGSL_CTXT_ERR(device,
|
||||
"Context that caused a GPU hang: %d\n", adreno_context->id);
|
||||
|
||||
adreno_context->flags |= CTXT_FLAGS_GPU_HANG;
|
||||
|
||||
/*
|
||||
* Set the reset status of all contexts to
|
||||
* INNOCENT_CONTEXT_RESET_EXT except for the bad context
|
||||
* since thats the guilty party
|
||||
* since thats the guilty party, if recovery failed then
|
||||
* mark all as guilty
|
||||
*/
|
||||
while ((context = idr_get_next(&device->context_idr, &next))) {
|
||||
if (KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT !=
|
||||
struct adreno_context *adreno_context = context->devctxt;
|
||||
if (recovery_status) {
|
||||
context->reset_status =
|
||||
KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT;
|
||||
adreno_context->flags |= CTXT_FLAGS_GPU_HANG;
|
||||
} else if (KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT !=
|
||||
context->reset_status) {
|
||||
if (context->id != rec_data->context_id)
|
||||
context->reset_status =
|
||||
KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT;
|
||||
else
|
||||
if (adreno_context->flags & (CTXT_FLAGS_GPU_HANG ||
|
||||
CTXT_FLAGS_GPU_HANG_RECOVERED))
|
||||
context->reset_status =
|
||||
KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT;
|
||||
else
|
||||
context->reset_status =
|
||||
KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT;
|
||||
}
|
||||
next = next + 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Restore valid commands in ringbuffer */
|
||||
adreno_ringbuffer_restore(rb, rec_data->rb_buffer, rec_data->rb_size);
|
||||
rb->timestamp[KGSL_MEMSTORE_GLOBAL] = timestamp;
|
||||
/* wait for idle */
|
||||
ret = adreno_idle(device, KGSL_TIMEOUT_DEFAULT);
|
||||
done:
|
||||
return ret;
|
||||
static void adreno_set_max_ts_for_bad_ctxs(struct kgsl_device *device)
|
||||
{
|
||||
struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
|
||||
struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
|
||||
struct kgsl_context *context;
|
||||
struct adreno_context *temp_adreno_context;
|
||||
int next = 0;
|
||||
|
||||
while ((context = idr_get_next(&device->context_idr, &next))) {
|
||||
temp_adreno_context = context->devctxt;
|
||||
if (temp_adreno_context->flags & CTXT_FLAGS_GPU_HANG) {
|
||||
kgsl_sharedmem_writel(&device->memstore,
|
||||
KGSL_MEMSTORE_OFFSET(context->id,
|
||||
soptimestamp),
|
||||
rb->timestamp[context->id]);
|
||||
kgsl_sharedmem_writel(&device->memstore,
|
||||
KGSL_MEMSTORE_OFFSET(context->id,
|
||||
eoptimestamp),
|
||||
rb->timestamp[context->id]);
|
||||
}
|
||||
next = next + 1;
|
||||
}
|
||||
}
|
||||
|
||||
static void adreno_destroy_recovery_data(struct adreno_recovery_data *rec_data)
|
||||
|
@ -966,7 +904,179 @@ done:
|
|||
return ret;
|
||||
}
|
||||
|
||||
int adreno_dump_and_recover(struct kgsl_device *device)
|
||||
static int
|
||||
_adreno_recover_hang(struct kgsl_device *device,
|
||||
struct adreno_recovery_data *rec_data,
|
||||
bool try_bad_commands)
|
||||
{
|
||||
int ret;
|
||||
struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
|
||||
struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
|
||||
struct kgsl_context *context;
|
||||
struct adreno_context *adreno_context = NULL;
|
||||
struct adreno_context *last_active_ctx = adreno_dev->drawctxt_active;
|
||||
|
||||
context = idr_find(&device->context_idr, rec_data->context_id);
|
||||
if (context == NULL) {
|
||||
KGSL_DRV_ERR(device, "Last context unknown id:%d\n",
|
||||
rec_data->context_id);
|
||||
} else {
|
||||
adreno_context = context->devctxt;
|
||||
adreno_context->flags |= CTXT_FLAGS_GPU_HANG;
|
||||
}
|
||||
|
||||
/* Extract valid contents from rb which can still be executed after
|
||||
* hang */
|
||||
ret = adreno_ringbuffer_extract(rb, rec_data);
|
||||
if (ret)
|
||||
goto done;
|
||||
|
||||
/* restart device */
|
||||
ret = adreno_stop(device);
|
||||
if (ret) {
|
||||
KGSL_DRV_ERR(device, "Device stop failed in recovery\n");
|
||||
goto done;
|
||||
}
|
||||
|
||||
ret = adreno_start(device, true);
|
||||
if (ret) {
|
||||
KGSL_DRV_ERR(device, "Device start failed in recovery\n");
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (context)
|
||||
kgsl_mmu_setstate(&device->mmu, adreno_context->pagetable,
|
||||
KGSL_MEMSTORE_GLOBAL);
|
||||
|
||||
/* Do not try the bad caommands if recovery has failed bad commands
|
||||
* once already */
|
||||
if (!try_bad_commands)
|
||||
rec_data->bad_rb_size = 0;
|
||||
|
||||
if (rec_data->bad_rb_size) {
|
||||
int idle_ret;
|
||||
/* submit the bad and good context commands and wait for
|
||||
* them to pass */
|
||||
adreno_ringbuffer_restore(rb, rec_data->bad_rb_buffer,
|
||||
rec_data->bad_rb_size);
|
||||
idle_ret = adreno_idle(device, KGSL_TIMEOUT_DEFAULT);
|
||||
if (idle_ret) {
|
||||
ret = adreno_stop(device);
|
||||
if (ret) {
|
||||
KGSL_DRV_ERR(device,
|
||||
"Device stop failed in recovery\n");
|
||||
goto done;
|
||||
}
|
||||
ret = adreno_start(device, true);
|
||||
if (ret) {
|
||||
KGSL_DRV_ERR(device,
|
||||
"Device start failed in recovery\n");
|
||||
goto done;
|
||||
}
|
||||
ret = idle_ret;
|
||||
KGSL_DRV_ERR(device,
|
||||
"Bad context commands hung in recovery\n");
|
||||
} else {
|
||||
KGSL_DRV_ERR(device,
|
||||
"Bad context commands succeeded in recovery\n");
|
||||
if (adreno_context)
|
||||
adreno_context->flags = (adreno_context->flags &
|
||||
~CTXT_FLAGS_GPU_HANG) |
|
||||
CTXT_FLAGS_GPU_HANG_RECOVERED;
|
||||
adreno_dev->drawctxt_active = last_active_ctx;
|
||||
}
|
||||
}
|
||||
/* If either the bad command sequence failed or we did not play it */
|
||||
if (ret || !rec_data->bad_rb_size) {
|
||||
adreno_ringbuffer_restore(rb, rec_data->rb_buffer,
|
||||
rec_data->rb_size);
|
||||
ret = adreno_idle(device, KGSL_TIMEOUT_DEFAULT);
|
||||
if (ret) {
|
||||
/* If we fail here we can try to invalidate another
|
||||
* context and try recovering again */
|
||||
ret = -EAGAIN;
|
||||
goto done;
|
||||
}
|
||||
/* ringbuffer now has data from the last valid context id,
|
||||
* so restore the active_ctx to the last valid context */
|
||||
if (rec_data->last_valid_ctx_id) {
|
||||
struct kgsl_context *last_ctx =
|
||||
idr_find(&device->context_idr,
|
||||
rec_data->last_valid_ctx_id);
|
||||
if (last_ctx)
|
||||
adreno_dev->drawctxt_active = last_ctx->devctxt;
|
||||
}
|
||||
}
|
||||
done:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int
|
||||
adreno_recover_hang(struct kgsl_device *device,
|
||||
struct adreno_recovery_data *rec_data)
|
||||
{
|
||||
int ret = 0;
|
||||
struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
|
||||
struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
|
||||
unsigned int timestamp;
|
||||
|
||||
KGSL_DRV_ERR(device,
|
||||
"Starting recovery from 3D GPU hang. Recovery parameters: IB1: 0x%X, "
|
||||
"Bad context_id: %u, global_eop: 0x%x\n",
|
||||
rec_data->ib1, rec_data->context_id, rec_data->global_eop);
|
||||
|
||||
timestamp = rb->timestamp[KGSL_MEMSTORE_GLOBAL];
|
||||
KGSL_DRV_ERR(device, "Last issued global timestamp: %x\n", timestamp);
|
||||
|
||||
/* We may need to replay commands multiple times based on whether
|
||||
* multiple contexts hang the GPU */
|
||||
while (true) {
|
||||
if (!ret)
|
||||
ret = _adreno_recover_hang(device, rec_data, true);
|
||||
else
|
||||
ret = _adreno_recover_hang(device, rec_data, false);
|
||||
|
||||
if (-EAGAIN == ret) {
|
||||
/* setup new recovery parameters and retry, this
|
||||
* means more than 1 contexts are causing hang */
|
||||
adreno_destroy_recovery_data(rec_data);
|
||||
adreno_setup_recovery_data(device, rec_data);
|
||||
KGSL_DRV_ERR(device,
|
||||
"Retry recovery from 3D GPU hang. Recovery parameters: "
|
||||
"IB1: 0x%X, Bad context_id: %u, global_eop: 0x%x\n",
|
||||
rec_data->ib1, rec_data->context_id,
|
||||
rec_data->global_eop);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (ret)
|
||||
goto done;
|
||||
|
||||
/* Restore correct states after recovery */
|
||||
if (adreno_dev->drawctxt_active)
|
||||
device->mmu.hwpagetable =
|
||||
adreno_dev->drawctxt_active->pagetable;
|
||||
else
|
||||
device->mmu.hwpagetable = device->mmu.defaultpagetable;
|
||||
rb->timestamp[KGSL_MEMSTORE_GLOBAL] = timestamp;
|
||||
kgsl_sharedmem_writel(&device->memstore,
|
||||
KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
|
||||
eoptimestamp),
|
||||
rb->timestamp[KGSL_MEMSTORE_GLOBAL]);
|
||||
done:
|
||||
adreno_set_max_ts_for_bad_ctxs(device);
|
||||
adreno_mark_context_status(device, ret);
|
||||
if (!ret)
|
||||
KGSL_DRV_ERR(device, "Recovery succeeded\n");
|
||||
else
|
||||
KGSL_DRV_ERR(device, "Recovery failed\n");
|
||||
return ret;
|
||||
}
|
||||
|
||||
int
|
||||
adreno_dump_and_recover(struct kgsl_device *device)
|
||||
{
|
||||
int result = -ETIMEDOUT;
|
||||
struct adreno_recovery_data rec_data;
|
||||
|
|
|
@ -44,6 +44,8 @@
|
|||
#define CTXT_FLAGS_TRASHSTATE 0x00020000
|
||||
/* per context timestamps enabled */
|
||||
#define CTXT_FLAGS_PER_CONTEXT_TS 0x00040000
|
||||
/* Context has caused a GPU hang and recovered properly */
|
||||
#define CTXT_FLAGS_GPU_HANG_RECOVERED 0x00008000
|
||||
|
||||
struct kgsl_device;
|
||||
struct adreno_device;
|
||||
|
|
|
@ -942,177 +942,347 @@ adreno_ringbuffer_issueibcmds(struct kgsl_device_private *dev_priv,
|
|||
*/
|
||||
adreno_idle(device, KGSL_TIMEOUT_DEFAULT);
|
||||
#endif
|
||||
/* If context hung and recovered then return error so that the
|
||||
* application may handle it */
|
||||
if (drawctxt->flags & CTXT_FLAGS_GPU_HANG_RECOVERED)
|
||||
return -EDEADLK;
|
||||
else
|
||||
return 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int _find_start_of_cmd_seq(struct adreno_ringbuffer *rb,
|
||||
unsigned int *ptr,
|
||||
bool inc)
|
||||
{
|
||||
int status = -EINVAL;
|
||||
unsigned int val1;
|
||||
unsigned int size = rb->buffer_desc.size;
|
||||
unsigned int start_ptr = *ptr;
|
||||
|
||||
while ((start_ptr / sizeof(unsigned int)) != rb->wptr) {
|
||||
if (inc)
|
||||
start_ptr = adreno_ringbuffer_inc_wrapped(start_ptr,
|
||||
size);
|
||||
else
|
||||
start_ptr = adreno_ringbuffer_dec_wrapped(start_ptr,
|
||||
size);
|
||||
kgsl_sharedmem_readl(&rb->buffer_desc, &val1, start_ptr);
|
||||
if (KGSL_CMD_IDENTIFIER == val1) {
|
||||
if ((start_ptr / sizeof(unsigned int)) != rb->wptr)
|
||||
start_ptr = adreno_ringbuffer_dec_wrapped(
|
||||
start_ptr, size);
|
||||
*ptr = start_ptr;
|
||||
status = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
static int _find_cmd_seq_after_eop_ts(struct adreno_ringbuffer *rb,
|
||||
unsigned int *rb_rptr,
|
||||
unsigned int global_eop,
|
||||
bool inc)
|
||||
{
|
||||
int status = -EINVAL;
|
||||
unsigned int temp_rb_rptr = *rb_rptr;
|
||||
unsigned int size = rb->buffer_desc.size;
|
||||
unsigned int val[3];
|
||||
int i = 0;
|
||||
bool check = false;
|
||||
|
||||
if (inc && temp_rb_rptr / sizeof(unsigned int) != rb->wptr)
|
||||
return status;
|
||||
|
||||
do {
|
||||
/* when decrementing we need to decrement first and
|
||||
* then read make sure we cover all the data */
|
||||
if (!inc)
|
||||
temp_rb_rptr = adreno_ringbuffer_dec_wrapped(
|
||||
temp_rb_rptr, size);
|
||||
kgsl_sharedmem_readl(&rb->buffer_desc, &val[i],
|
||||
temp_rb_rptr);
|
||||
|
||||
if (check && ((inc && val[i] == global_eop) ||
|
||||
(!inc && (val[i] ==
|
||||
cp_type3_packet(CP_MEM_WRITE, 2) ||
|
||||
val[i] == CACHE_FLUSH_TS)))) {
|
||||
/* decrement i, i.e i = (i - 1 + 3) % 3 if
|
||||
* we are going forward, else increment i */
|
||||
i = (i + 2) % 3;
|
||||
if (val[i] == rb->device->memstore.gpuaddr +
|
||||
KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
|
||||
eoptimestamp)) {
|
||||
int j = ((i + 2) % 3);
|
||||
if ((inc && (val[j] == CACHE_FLUSH_TS ||
|
||||
val[j] == cp_type3_packet(
|
||||
CP_MEM_WRITE, 2))) ||
|
||||
(!inc && val[j] == global_eop)) {
|
||||
/* Found the global eop */
|
||||
status = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* if no match found then increment i again
|
||||
* since we decremented before matching */
|
||||
i = (i + 1) % 3;
|
||||
}
|
||||
if (inc)
|
||||
temp_rb_rptr = adreno_ringbuffer_inc_wrapped(
|
||||
temp_rb_rptr, size);
|
||||
|
||||
i = (i + 1) % 3;
|
||||
if (2 == i)
|
||||
check = true;
|
||||
} while (temp_rb_rptr / sizeof(unsigned int) != rb->wptr);
|
||||
/* temp_rb_rptr points to the command stream after global eop,
|
||||
* move backward till the start of command sequence */
|
||||
if (!status) {
|
||||
status = _find_start_of_cmd_seq(rb, &temp_rb_rptr, false);
|
||||
if (!status) {
|
||||
*rb_rptr = temp_rb_rptr;
|
||||
KGSL_DRV_ERR(rb->device,
|
||||
"Offset of cmd sequence after eop timestamp: 0x%x\n",
|
||||
temp_rb_rptr / sizeof(unsigned int));
|
||||
}
|
||||
}
|
||||
if (status)
|
||||
KGSL_DRV_ERR(rb->device,
|
||||
"Failed to find the command sequence after eop timestamp\n");
|
||||
return status;
|
||||
}
|
||||
|
||||
static int _find_hanging_ib_sequence(struct adreno_ringbuffer *rb,
|
||||
unsigned int *rb_rptr,
|
||||
unsigned int ib1)
|
||||
{
|
||||
int status = -EINVAL;
|
||||
unsigned int temp_rb_rptr = *rb_rptr;
|
||||
unsigned int size = rb->buffer_desc.size;
|
||||
unsigned int val[2];
|
||||
int i = 0;
|
||||
bool check = false;
|
||||
bool ctx_switch = false;
|
||||
|
||||
while (temp_rb_rptr / sizeof(unsigned int) != rb->wptr) {
|
||||
kgsl_sharedmem_readl(&rb->buffer_desc, &val[i], temp_rb_rptr);
|
||||
|
||||
if (check && val[i] == ib1) {
|
||||
/* decrement i, i.e i = (i - 1 + 2) % 2 */
|
||||
i = (i + 1) % 2;
|
||||
if (adreno_cmd_is_ib(val[i])) {
|
||||
/* go till start of command sequence */
|
||||
status = _find_start_of_cmd_seq(rb,
|
||||
&temp_rb_rptr, false);
|
||||
KGSL_DRV_ERR(rb->device,
|
||||
"Found the hanging IB at offset 0x%x\n",
|
||||
temp_rb_rptr / sizeof(unsigned int));
|
||||
break;
|
||||
}
|
||||
/* if no match the increment i since we decremented
|
||||
* before checking */
|
||||
i = (i + 1) % 2;
|
||||
}
|
||||
/* Make sure you do not encounter a context switch twice, we can
|
||||
* encounter it once for the bad context as the start of search
|
||||
* can point to the context switch */
|
||||
if (val[i] == KGSL_CONTEXT_TO_MEM_IDENTIFIER) {
|
||||
if (ctx_switch) {
|
||||
KGSL_DRV_ERR(rb->device,
|
||||
"Context switch encountered before bad "
|
||||
"IB found\n");
|
||||
break;
|
||||
}
|
||||
ctx_switch = true;
|
||||
}
|
||||
i = (i + 1) % 2;
|
||||
if (1 == i)
|
||||
check = true;
|
||||
temp_rb_rptr = adreno_ringbuffer_inc_wrapped(temp_rb_rptr,
|
||||
size);
|
||||
}
|
||||
if (!status)
|
||||
*rb_rptr = temp_rb_rptr;
|
||||
return status;
|
||||
}
|
||||
|
||||
static void _turn_preamble_on_for_ib_seq(struct adreno_ringbuffer *rb,
|
||||
unsigned int rb_rptr)
|
||||
{
|
||||
unsigned int temp_rb_rptr = rb_rptr;
|
||||
unsigned int size = rb->buffer_desc.size;
|
||||
unsigned int val[2];
|
||||
int i = 0;
|
||||
bool check = false;
|
||||
bool cmd_start = false;
|
||||
|
||||
/* Go till the start of the ib sequence and turn on preamble */
|
||||
while (temp_rb_rptr / sizeof(unsigned int) != rb->wptr) {
|
||||
kgsl_sharedmem_readl(&rb->buffer_desc, &val[i], temp_rb_rptr);
|
||||
if (check && KGSL_START_OF_IB_IDENTIFIER == val[i]) {
|
||||
/* decrement i */
|
||||
i = (i + 1) % 2;
|
||||
if (val[i] == cp_nop_packet(4)) {
|
||||
temp_rb_rptr = adreno_ringbuffer_dec_wrapped(
|
||||
temp_rb_rptr, size);
|
||||
kgsl_sharedmem_writel(&rb->buffer_desc,
|
||||
temp_rb_rptr, cp_nop_packet(1));
|
||||
}
|
||||
KGSL_DRV_ERR(rb->device,
|
||||
"Turned preamble on at offset 0x%x\n",
|
||||
temp_rb_rptr / 4);
|
||||
break;
|
||||
}
|
||||
/* If you reach beginning of next command sequence then exit
|
||||
* First command encountered is the current one so don't break
|
||||
* on that. */
|
||||
if (KGSL_CMD_IDENTIFIER == val[i]) {
|
||||
if (cmd_start)
|
||||
break;
|
||||
cmd_start = true;
|
||||
}
|
||||
|
||||
i = (i + 1) % 2;
|
||||
if (1 == i)
|
||||
check = true;
|
||||
temp_rb_rptr = adreno_ringbuffer_inc_wrapped(temp_rb_rptr,
|
||||
size);
|
||||
}
|
||||
}
|
||||
|
||||
static void _copy_valid_rb_content(struct adreno_ringbuffer *rb,
|
||||
unsigned int rb_rptr, unsigned int *temp_rb_buffer,
|
||||
int *rb_size, unsigned int *bad_rb_buffer,
|
||||
int *bad_rb_size,
|
||||
int *last_valid_ctx_id)
|
||||
{
|
||||
unsigned int good_rb_idx = 0, cmd_start_idx = 0;
|
||||
unsigned int val1 = 0;
|
||||
struct kgsl_context *k_ctxt;
|
||||
struct adreno_context *a_ctxt;
|
||||
unsigned int bad_rb_idx = 0;
|
||||
int copy_rb_contents = 0;
|
||||
unsigned int temp_rb_rptr;
|
||||
unsigned int size = rb->buffer_desc.size;
|
||||
unsigned int good_cmd_start_idx = 0;
|
||||
|
||||
/* Walk the rb from the context switch. Omit any commands
|
||||
* for an invalid context. */
|
||||
while ((rb_rptr / sizeof(unsigned int)) != rb->wptr) {
|
||||
kgsl_sharedmem_readl(&rb->buffer_desc, &val1, rb_rptr);
|
||||
|
||||
if (KGSL_CMD_IDENTIFIER == val1) {
|
||||
/* Start is the NOP dword that comes before
|
||||
* KGSL_CMD_IDENTIFIER */
|
||||
cmd_start_idx = bad_rb_idx - 1;
|
||||
if (copy_rb_contents)
|
||||
good_cmd_start_idx = good_rb_idx - 1;
|
||||
}
|
||||
|
||||
/* check for context switch indicator */
|
||||
if (val1 == KGSL_CONTEXT_TO_MEM_IDENTIFIER) {
|
||||
unsigned int temp_idx, val2;
|
||||
/* increment by 3 to get to the context_id */
|
||||
temp_rb_rptr = rb_rptr + (3 * sizeof(unsigned int)) %
|
||||
size;
|
||||
kgsl_sharedmem_readl(&rb->buffer_desc, &val2,
|
||||
temp_rb_rptr);
|
||||
|
||||
/* if context switches to a context that did not cause
|
||||
* hang then start saving the rb contents as those
|
||||
* commands can be executed */
|
||||
k_ctxt = idr_find(&rb->device->context_idr, val2);
|
||||
if (k_ctxt) {
|
||||
a_ctxt = k_ctxt->devctxt;
|
||||
|
||||
/* If we are changing to a good context and were not
|
||||
* copying commands then copy over commands to the good
|
||||
* context */
|
||||
if (!copy_rb_contents && ((k_ctxt &&
|
||||
!(a_ctxt->flags & CTXT_FLAGS_GPU_HANG)) ||
|
||||
!k_ctxt)) {
|
||||
for (temp_idx = cmd_start_idx;
|
||||
temp_idx < bad_rb_idx;
|
||||
temp_idx++)
|
||||
temp_rb_buffer[good_rb_idx++] =
|
||||
bad_rb_buffer[temp_idx];
|
||||
*last_valid_ctx_id = val2;
|
||||
copy_rb_contents = 1;
|
||||
} else if (copy_rb_contents && k_ctxt &&
|
||||
(a_ctxt->flags & CTXT_FLAGS_GPU_HANG)) {
|
||||
/* If we are changing to bad context then remove
|
||||
* the dwords we copied for this sequence from
|
||||
* the good buffer */
|
||||
good_rb_idx = good_cmd_start_idx;
|
||||
copy_rb_contents = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (copy_rb_contents)
|
||||
temp_rb_buffer[good_rb_idx++] = val1;
|
||||
/* Copy both good and bad commands for replay to the bad
|
||||
* buffer */
|
||||
bad_rb_buffer[bad_rb_idx++] = val1;
|
||||
|
||||
rb_rptr = adreno_ringbuffer_inc_wrapped(rb_rptr, size);
|
||||
}
|
||||
*rb_size = good_rb_idx;
|
||||
*bad_rb_size = bad_rb_idx;
|
||||
}
|
||||
|
||||
int adreno_ringbuffer_extract(struct adreno_ringbuffer *rb,
|
||||
struct adreno_recovery_data *rec_data)
|
||||
{
|
||||
int status;
|
||||
struct kgsl_device *device = rb->device;
|
||||
unsigned int rb_rptr = rb->wptr * sizeof(unsigned int);
|
||||
unsigned int temp_idx = 0;
|
||||
unsigned int value;
|
||||
unsigned int val1;
|
||||
unsigned int val2;
|
||||
unsigned int val3;
|
||||
unsigned int copy_rb_contents = 0;
|
||||
struct kgsl_context *context;
|
||||
unsigned int *temp_rb_buffer = rec_data->rb_buffer;
|
||||
struct adreno_context *adreno_context;
|
||||
|
||||
KGSL_DRV_ERR(device, "Last context id: %d\n", rec_data->context_id);
|
||||
context = idr_find(&device->context_idr, rec_data->context_id);
|
||||
if (context == NULL) {
|
||||
KGSL_DRV_ERR(device,
|
||||
"GPU recovery from hang not possible because last"
|
||||
" context id is invalid.\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
KGSL_DRV_ERR(device, "GPU successfully executed till ts: %x\n",
|
||||
rec_data->global_eop);
|
||||
/*
|
||||
* We need to go back in history by 4 dwords from the current location
|
||||
* of read pointer as 4 dwords are read to match the end of a command.
|
||||
* Also, take care of wrap around when moving back
|
||||
*/
|
||||
if (rb->rptr >= 4)
|
||||
rb_rptr = (rb->rptr - 4) * sizeof(unsigned int);
|
||||
else
|
||||
rb_rptr = rb->buffer_desc.size -
|
||||
((4 - rb->rptr) * sizeof(unsigned int));
|
||||
/* Read the rb contents going backwards to locate end of last
|
||||
* sucessfully executed command */
|
||||
while ((rb_rptr / sizeof(unsigned int)) != rb->wptr) {
|
||||
kgsl_sharedmem_readl(&rb->buffer_desc, &value, rb_rptr);
|
||||
if (value == rec_data->global_eop) {
|
||||
rb_rptr = adreno_ringbuffer_inc_wrapped(rb_rptr,
|
||||
rb->buffer_desc.size);
|
||||
kgsl_sharedmem_readl(&rb->buffer_desc, &val1, rb_rptr);
|
||||
rb_rptr = adreno_ringbuffer_inc_wrapped(rb_rptr,
|
||||
rb->buffer_desc.size);
|
||||
kgsl_sharedmem_readl(&rb->buffer_desc, &val2, rb_rptr);
|
||||
rb_rptr = adreno_ringbuffer_inc_wrapped(rb_rptr,
|
||||
rb->buffer_desc.size);
|
||||
kgsl_sharedmem_readl(&rb->buffer_desc, &val3, rb_rptr);
|
||||
/* match the pattern found at the end of a command */
|
||||
if ((val1 == 2 &&
|
||||
val2 == cp_type3_packet(CP_INTERRUPT, 1)
|
||||
&& val3 == CP_INT_CNTL__RB_INT_MASK) ||
|
||||
(val1 == cp_type3_packet(CP_EVENT_WRITE, 3)
|
||||
&& val2 == CACHE_FLUSH_TS &&
|
||||
val3 == (rb->device->memstore.gpuaddr +
|
||||
KGSL_MEMSTORE_OFFSET(rec_data->context_id,
|
||||
eoptimestamp)))) {
|
||||
rb_rptr = adreno_ringbuffer_inc_wrapped(rb_rptr,
|
||||
rb->buffer_desc.size);
|
||||
KGSL_DRV_ERR(device,
|
||||
"Found end of last executed "
|
||||
"command at offset: %x\n",
|
||||
rb_rptr / sizeof(unsigned int));
|
||||
break;
|
||||
} else {
|
||||
if (rb_rptr < (3 * sizeof(unsigned int)))
|
||||
rb_rptr = rb->buffer_desc.size -
|
||||
(3 * sizeof(unsigned int))
|
||||
+ rb_rptr;
|
||||
else
|
||||
rb_rptr -= (3 * sizeof(unsigned int));
|
||||
|
||||
/* Look for the command stream that is right after the global eop */
|
||||
status = _find_cmd_seq_after_eop_ts(rb, &rb_rptr,
|
||||
rec_data->global_eop + 1, false);
|
||||
if (status)
|
||||
goto done;
|
||||
|
||||
if (context) {
|
||||
adreno_context = context->devctxt;
|
||||
|
||||
if (adreno_context->flags & CTXT_FLAGS_PREAMBLE) {
|
||||
if (rec_data->ib1) {
|
||||
status = _find_hanging_ib_sequence(rb, &rb_rptr,
|
||||
rec_data->ib1);
|
||||
if (status)
|
||||
goto copy_rb_contents;
|
||||
}
|
||||
_turn_preamble_on_for_ib_seq(rb, rb_rptr);
|
||||
} else {
|
||||
status = -EINVAL;
|
||||
}
|
||||
|
||||
if (rb_rptr == 0)
|
||||
rb_rptr = rb->buffer_desc.size - sizeof(unsigned int);
|
||||
else
|
||||
rb_rptr -= sizeof(unsigned int);
|
||||
}
|
||||
|
||||
if ((rb_rptr / sizeof(unsigned int)) == rb->wptr) {
|
||||
KGSL_DRV_ERR(device,
|
||||
"GPU recovery from hang not possible because last"
|
||||
" successful timestamp is overwritten\n");
|
||||
return -EINVAL;
|
||||
copy_rb_contents:
|
||||
_copy_valid_rb_content(rb, rb_rptr, rec_data->rb_buffer,
|
||||
&rec_data->rb_size,
|
||||
rec_data->bad_rb_buffer,
|
||||
&rec_data->bad_rb_size,
|
||||
&rec_data->last_valid_ctx_id);
|
||||
/* If we failed to get the hanging IB sequence then we cannot execute
|
||||
* commands from the bad context or preambles not supported */
|
||||
if (status) {
|
||||
rec_data->bad_rb_size = 0;
|
||||
status = 0;
|
||||
}
|
||||
/* rb_rptr is now pointing to the first dword of the command following
|
||||
* the last sucessfully executed command sequence. Assumption is that
|
||||
* GPU is hung in the command sequence pointed by rb_rptr */
|
||||
/* make sure the GPU is not hung in a command submitted by kgsl
|
||||
* itself */
|
||||
kgsl_sharedmem_readl(&rb->buffer_desc, &val1, rb_rptr);
|
||||
kgsl_sharedmem_readl(&rb->buffer_desc, &val2,
|
||||
adreno_ringbuffer_inc_wrapped(rb_rptr,
|
||||
rb->buffer_desc.size));
|
||||
if (val1 == cp_nop_packet(1) && val2 == KGSL_CMD_IDENTIFIER) {
|
||||
KGSL_DRV_ERR(device,
|
||||
"GPU recovery from hang not possible because "
|
||||
"of hang in kgsl command\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
while ((rb_rptr / sizeof(unsigned int)) != rb->wptr) {
|
||||
kgsl_sharedmem_readl(&rb->buffer_desc, &value, rb_rptr);
|
||||
rb_rptr = adreno_ringbuffer_inc_wrapped(rb_rptr,
|
||||
rb->buffer_desc.size);
|
||||
/* check for context switch indicator */
|
||||
if (value == KGSL_CONTEXT_TO_MEM_IDENTIFIER) {
|
||||
kgsl_sharedmem_readl(&rb->buffer_desc, &value, rb_rptr);
|
||||
rb_rptr = adreno_ringbuffer_inc_wrapped(rb_rptr,
|
||||
rb->buffer_desc.size);
|
||||
BUG_ON(value != cp_type3_packet(CP_MEM_WRITE, 2));
|
||||
kgsl_sharedmem_readl(&rb->buffer_desc, &val1, rb_rptr);
|
||||
rb_rptr = adreno_ringbuffer_inc_wrapped(rb_rptr,
|
||||
rb->buffer_desc.size);
|
||||
BUG_ON(val1 != (device->memstore.gpuaddr +
|
||||
KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
|
||||
current_context)));
|
||||
kgsl_sharedmem_readl(&rb->buffer_desc, &value, rb_rptr);
|
||||
rb_rptr = adreno_ringbuffer_inc_wrapped(rb_rptr,
|
||||
rb->buffer_desc.size);
|
||||
|
||||
/*
|
||||
* If other context switches were already lost and
|
||||
* and the current context is the one that is hanging,
|
||||
* then we cannot recover. Print an error message
|
||||
* and leave.
|
||||
*/
|
||||
|
||||
if ((copy_rb_contents == 0) && (value ==
|
||||
rec_data->context_id)) {
|
||||
KGSL_DRV_ERR(device, "GPU recovery could not "
|
||||
"find the previous context\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we were copying the commands and got to this point
|
||||
* then we need to remove the 3 commands that appear
|
||||
* before KGSL_CONTEXT_TO_MEM_IDENTIFIER
|
||||
*/
|
||||
if (temp_idx)
|
||||
temp_idx -= 3;
|
||||
/* if context switches to a context that did not cause
|
||||
* hang then start saving the rb contents as those
|
||||
* commands can be executed */
|
||||
if (value != rec_data->context_id) {
|
||||
copy_rb_contents = 1;
|
||||
temp_rb_buffer[temp_idx++] = cp_nop_packet(1);
|
||||
temp_rb_buffer[temp_idx++] =
|
||||
KGSL_CMD_IDENTIFIER;
|
||||
temp_rb_buffer[temp_idx++] = cp_nop_packet(1);
|
||||
temp_rb_buffer[temp_idx++] =
|
||||
KGSL_CONTEXT_TO_MEM_IDENTIFIER;
|
||||
temp_rb_buffer[temp_idx++] =
|
||||
cp_type3_packet(CP_MEM_WRITE, 2);
|
||||
temp_rb_buffer[temp_idx++] = val1;
|
||||
temp_rb_buffer[temp_idx++] = value;
|
||||
} else {
|
||||
copy_rb_contents = 0;
|
||||
}
|
||||
} else if (copy_rb_contents)
|
||||
temp_rb_buffer[temp_idx++] = value;
|
||||
}
|
||||
|
||||
rec_data->rb_size = temp_idx;
|
||||
return 0;
|
||||
/* If there is no context then that means there are no commands for
|
||||
* good case */
|
||||
if (!context)
|
||||
rec_data->rb_size = 0;
|
||||
done:
|
||||
return status;
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
@ -139,4 +139,11 @@ static inline unsigned int adreno_ringbuffer_inc_wrapped(unsigned int val,
|
|||
return (val + sizeof(unsigned int)) % size;
|
||||
}
|
||||
|
||||
/*
 * Step a byte offset back by one dword (sizeof(unsigned int)), wrapping
 * around a buffer of 'size' bytes.  'size' is added before subtracting so
 * that an offset of 0 wraps to the last dword instead of underflowing.
 */
static inline unsigned int adreno_ringbuffer_dec_wrapped(unsigned int val,
							unsigned int size)
{
	unsigned int biased = val + size;

	return (biased - sizeof(unsigned int)) % size;
}
|
||||
|
||||
#endif /* __ADRENO_RINGBUFFER_H */
|
||||
|
|
Loading…
Reference in New Issue