Skip to content

Commit 81c2585

Browse files
committed
Refactor error handling & tracing to be per-AFU
Also allow the default error handling behavior to be overridden by the user. Report the error values along with the message. Signed-off-by: Alastair D'Silva <alastair@d-silva.org>
1 parent 7d10c26 commit 81c2585

9 files changed

Lines changed: 425 additions & 227 deletions

File tree

src/afu.c

Lines changed: 107 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,61 @@ size_t ocxl_afu_get_mmio_size(ocxl_afu_h afu)
154154
return my_afu->per_pasid_mmio.length;
155155
}
156156

157+
/**
158+
* @}
159+
*
160+
* @defgroup ocxl_afu_messages OpenCAPI AFU messages
161+
*
162+
* These functions control messages from libocxl, such as error messages and tracing
163+
*
164+
* @{
165+
*/
166+
167+
/**
168+
* Enable messages from libocxl
169+
*
170+
* Error messages, if enabled, are emitted by default on STDERR. This behaviour may be
171+
* overridden by ocxl_afu_set_error_message_handler().
172+
*
173+
* Tracing, if enabled, is always emitted on STDERR. It assists a developer by showing
174+
* detailed AFU information, as well as MMIO & IRQ interactions between the application
175+
* and the AFU. It does not show direct accesses to memory from the AFU.
176+
*
177+
* @param afu the AFU to enable messages on
178+
* @param sources a bitwise OR of the message sources to enable (OCXL_ERRORS, OCXL_TRACING)
179+
* @see ocxl_afu_set_error_message_handler()
180+
*/
181+
void ocxl_afu_enable_messages(ocxl_afu_h afu, uint64_t sources)
182+
{
183+
ocxl_afu *my_afu = (ocxl_afu *) afu;
184+
185+
my_afu->verbose_errors = !!(sources & OCXL_ERRORS);
186+
my_afu->tracing = !!(sources & OCXL_TRACING);
187+
}
188+
189+
/**
190+
* Override the default handler for emitting error messages for an AFU
191+
*
192+
* The default error handler emits messages on STDERR. To override this behavior,
193+
* pass a callback to this function.
194+
*
195+
* The callback is responsible for prefixing and line termination.
196+
*
197+
* Typical use cases would be redirecting error messages to the application's own
198+
* logging/reporting mechanisms, and adding additional application-specific context
199+
* to the error messages.
200+
*
201+
* @param afu the AFU to override the error handler for
202+
* @param handler the new error message handler
203+
*/
204+
void ocxl_afu_set_error_message_handler(ocxl_afu_h afu, void (*handler)(ocxl_afu_h afu, ocxl_err error,
205+
const char *message))
206+
{
207+
ocxl_afu *my_afu = (ocxl_afu *) afu;
208+
209+
my_afu->error_handler = handler;
210+
}
211+
157212
/**
158213
* @}
159214
*
@@ -211,6 +266,11 @@ static void afu_init(ocxl_afu * afu)
211266

212267
afu->pasid = UINT32_MAX;
213268

269+
afu->verbose_errors = false;
270+
afu->error_handler = ocxl_default_afu_error_handler;
271+
272+
afu->tracing = false;
273+
214274
#ifdef _ARCH_PPC64
215275
afu->ppc64_amr = 0;
216276
#endif
@@ -227,8 +287,9 @@ static ocxl_err ocxl_afu_alloc(ocxl_afu_h * afu_out)
227287
{
228288
ocxl_afu *afu = malloc(sizeof(ocxl_afu));
229289
if (afu == NULL) {
230-
errmsg("Could not allocate %d bytes for AFU", sizeof(ocxl_afu));
231-
return OCXL_NO_MEM;
290+
ocxl_err rc = OCXL_NO_MEM;
291+
errmsg(NULL, rc, "Could not allocate %d bytes for AFU", sizeof(ocxl_afu));
292+
return rc;
232293
}
233294

234295
afu_init(afu);
@@ -281,13 +342,13 @@ static bool populate_metadata(dev_t dev, ocxl_afu * afu)
281342

282343
char *physical_function = strchr(dev_ent->d_name, '.');
283344
if (physical_function == NULL) {
284-
errmsg("Could not extract physical function from device name '%s', missing initial '.'",
345+
errmsg(NULL, OCXL_INTERNAL_ERROR, "Could not extract physical function from device name '%s', missing initial '.'",
285346
dev_ent->d_name);
286347
return false;
287348
}
288349
int afu_name_len = physical_function - dev_ent->d_name;
289350
if (afu_name_len > AFU_NAME_MAX) {
290-
errmsg("AFU name '%-.*s' exceeds maximum length of %d", afu_name_len, dev_ent->d_name);
351+
errmsg(NULL, OCXL_INTERNAL_ERROR,"AFU name '%-.*s' exceeds maximum length of %d", afu_name_len, dev_ent->d_name);
291352
return false;
292353
}
293354

@@ -298,7 +359,8 @@ static bool populate_metadata(dev_t dev, ocxl_afu * afu)
298359
&domain, &bus, &device, &function, &afu->identifier.afu_index);
299360

300361
if (found != 5) {
301-
errmsg("Could not parse physical function '%s', only got %d components", physical_function, found);
362+
errmsg(NULL, OCXL_INTERNAL_ERROR, "Could not parse physical function '%s', only got %d components", physical_function,
363+
found);
302364
return false;
303365
}
304366

@@ -308,15 +370,15 @@ static bool populate_metadata(dev_t dev, ocxl_afu * afu)
308370
size_t dev_path_len = strlen(DEVICE_PATH) + 1 + strlen(dev_ent->d_name) + 1;
309371
afu->device_path = malloc(dev_path_len);
310372
if (NULL == afu->device_path) {
311-
errmsg("Could not allocate %llu bytes for device path", dev_path_len);
373+
errmsg(NULL, OCXL_INTERNAL_ERROR, "Could not allocate %llu bytes for device path", dev_path_len);
312374
return false;
313375
}
314376
(void)snprintf(afu->device_path, dev_path_len, "%s/%s", DEVICE_PATH, dev_ent->d_name);
315377

316378
size_t sysfs_path_len = strlen(SYS_PATH) + 1 + strlen(dev_ent->d_name) + 1;
317379
afu->sysfs_path = malloc(sysfs_path_len);
318380
if (NULL == afu->sysfs_path) {
319-
errmsg("Could not allocate %llu bytes for sysfs path", sysfs_path_len);
381+
errmsg(NULL, OCXL_INTERNAL_ERROR, "Could not allocate %llu bytes for sysfs path", sysfs_path_len);
320382
return false;
321383
}
322384
(void)snprintf(afu->sysfs_path, sysfs_path_len, "%s/%s", SYS_PATH, dev_ent->d_name);
@@ -331,15 +393,15 @@ static bool populate_metadata(dev_t dev, ocxl_afu * afu)
331393
*/
332394
static void trace_metadata(ocxl_afu *afu)
333395
{
334-
TRACE("device path=\"%s\"", afu->device_path);
335-
TRACE("sysfs path=\"%s\"", afu->sysfs_path);
336-
TRACE("AFU Name=\"%s\"", afu->identifier.afu_name);
337-
TRACE("AFU Index=%u", afu->identifier.afu_index);
338-
TRACE("AFU Version=%u:%u", afu->version_major, afu->version_minor);
339-
TRACE("Global MMIO size=%llu", afu->global_mmio.length);
340-
TRACE("Per PASID MMIO size=%llu", afu->per_pasid_mmio.length);
341-
TRACE("Page Size=%llu", afu->page_size);
342-
TRACE("PASID=%lu", afu->pasid);
396+
TRACE(afu, "device path=\"%s\"", afu->device_path);
397+
TRACE(afu, "sysfs path=\"%s\"", afu->sysfs_path);
398+
TRACE(afu, "AFU Name=\"%s\"", afu->identifier.afu_name);
399+
TRACE(afu, "AFU Index=%u", afu->identifier.afu_index);
400+
TRACE(afu, "AFU Version=%u:%u", afu->version_major, afu->version_minor);
401+
TRACE(afu, "Global MMIO size=%llu", afu->global_mmio.length);
402+
TRACE(afu, "Per PASID MMIO size=%llu", afu->per_pasid_mmio.length);
403+
TRACE(afu, "Page Size=%llu", afu->page_size);
404+
TRACE(afu, "PASID=%lu", afu->pasid);
343405
}
344406

345407
/**
@@ -363,38 +425,44 @@ static ocxl_err afu_open(ocxl_afu *afu)
363425
int fd = open(afu->device_path, O_RDWR | O_CLOEXEC | O_NONBLOCK);
364426
if (fd < 0) {
365427
if (errno == ENOSPC) {
366-
errmsg("Could not open AFU device '%s', the maximum number of contexts has been reached: Error %d: %s",
428+
ocxl_err rc = OCXL_NO_MORE_CONTEXTS;
429+
errmsg(afu, rc, "Could not open AFU device '%s', the maximum number of contexts has been reached: Error %d: %s",
367430
afu->device_path, errno, strerror(errno));
368-
return OCXL_NO_MORE_CONTEXTS;
431+
return rc;
369432
}
370-
errmsg("Could not open AFU device '%s': Error %d: %s", afu->device_path, errno, strerror(errno));
371-
return OCXL_NO_DEV;
433+
434+
ocxl_err rc = OCXL_NO_DEV;
435+
errmsg(afu, rc, "Could not open AFU device '%s': Error %d: %s", afu->device_path, errno, strerror(errno));
436+
return rc;
372437
}
373438

374439
afu->fd = fd;
375440

376441
fd = epoll_create1(EPOLL_CLOEXEC);
377442
if (fd < 0) {
378-
errmsg("Could not create epoll descriptor. Error %d: %s",
443+
ocxl_err rc = OCXL_NO_DEV;
444+
errmsg(afu, rc, "Could not create epoll descriptor. Error %d: %s",
379445
errno, strerror(errno));
380-
return OCXL_NO_DEV;
446+
return rc;
381447
}
382448
afu->epoll_fd = fd;
383449

384450
struct epoll_event ev;
385451
ev.events = EPOLLIN;
386452
ev.data.ptr = &afu->fd_info; // Already set up in afu_init
387453
if (epoll_ctl(afu->epoll_fd, EPOLL_CTL_ADD, afu->fd, &ev) == -1) {
388-
errmsg("Could not add device fd %d to epoll fd %d for AFU '%s': %d: '%s'",
454+
ocxl_err rc = OCXL_NO_DEV;
455+
errmsg(afu, rc, "Could not add device fd %d to epoll fd %d for AFU '%s': %d: '%s'",
389456
afu->fd, afu->epoll_fd, afu->identifier.afu_name,
390457
errno, strerror(errno));
391-
return OCXL_NO_DEV;
458+
return rc;
392459
}
393460

394461
struct ocxl_ioctl_metadata metadata;
395462
if (ioctl(afu->fd, OCXL_IOCTL_GET_METADATA, &metadata)) {
396-
errmsg("OCXL_IOCTL_GET_METADATA failed %d:%s", errno, strerror(errno));
397-
return OCXL_NO_DEV;
463+
ocxl_err rc = OCXL_NO_DEV;
464+
errmsg(afu, rc, "OCXL_IOCTL_GET_METADATA failed %d:%s", errno, strerror(errno));
465+
return rc;
398466
}
399467

400468
if (metadata.version >= 0) {
@@ -405,7 +473,7 @@ static ocxl_err afu_open(ocxl_afu *afu)
405473
afu->pasid = metadata.pasid;
406474
}
407475

408-
if (tracing) {
476+
if (afu->tracing) {
409477
trace_metadata(afu);
410478
}
411479

@@ -434,16 +502,18 @@ static ocxl_err get_afu_by_path(const char *path, ocxl_afu_h * afu)
434502

435503
struct stat dev_stats;
436504
if (stat(path, &dev_stats)) {
437-
errmsg("Could not stat AFU device '%s': Error %d: %s", path, errno, strerror(errno));
505+
ocxl_err rc = OCXL_NO_DEV;
506+
errmsg(NULL, rc, "Could not stat AFU device '%s': Error %d: %s", path, errno, strerror(errno));
438507
*afu = OCXL_INVALID_AFU;
439-
return OCXL_NO_DEV;
508+
return rc;
440509
}
441510

442511
if (!populate_metadata(dev_stats.st_rdev, my_afu)) {
443-
errmsg("Could not find OCXL device for '%s', major=%d, minor=%d, device expected in '%s'",
512+
ocxl_err rc = OCXL_NO_DEV;
513+
errmsg(NULL, rc, "Could not find OCXL device for '%s', major=%d, minor=%d, device expected in '%s'",
444514
path, major(dev_stats.st_rdev), minor(dev_stats.st_rdev), DEVICE_PATH);
445515
*afu = OCXL_INVALID_AFU;
446-
return OCXL_NO_DEV;
516+
return rc;
447517
}
448518

449519
*afu = afu_h;
@@ -515,15 +585,15 @@ ocxl_err ocxl_afu_open_specific(const char *name, const char *physical_function,
515585
case 0:
516586
break;
517587
case GLOB_NOSPACE:
518-
errmsg("No memory for glob while listing AFUs");
519588
ret = OCXL_NO_MEM;
589+
errmsg(NULL, ret, "No memory for glob while listing AFUs");
520590
goto end;
521591
case GLOB_NOMATCH:
522-
errmsg("No OCXL devices found in '%s' with pattern '%s'", DEVICE_PATH, pattern);
523592
ret = OCXL_NO_DEV;
593+
errmsg(NULL, ret, "No OCXL devices found in '%s' with pattern '%s'", DEVICE_PATH, pattern);
524594
goto end;
525595
default:
526-
errmsg("Glob error %d while listing AFUs", rc);
596+
errmsg(NULL, ret, "Glob error %d while listing AFUs", rc);
527597
goto end;
528598
}
529599

@@ -590,8 +660,9 @@ ocxl_err ocxl_afu_attach(ocxl_afu_h afu)
590660
#endif
591661

592662
if (ioctl(my_afu->fd, OCXL_IOCTL_ATTACH, &attach_args)) {
593-
errmsg("OCXL_IOCTL_ATTACH failed %d:%s", errno, strerror(errno));
594-
return OCXL_INTERNAL_ERROR;
663+
ocxl_err rc = OCXL_INTERNAL_ERROR;
664+
errmsg(my_afu, rc, "OCXL_IOCTL_ATTACH failed %d:%s", errno, strerror(errno));
665+
return rc;
595666
}
596667

597668
return OCXL_OK;

src/include/libocxl.h

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@
2626
extern "C" {
2727
#endif
2828

29+
#define OCXL_NO_MESSAGES 0
30+
#define OCXL_ERRORS (1 << 0)
31+
#define OCXL_TRACING (1 << 1)
32+
2933

3034
/**
3135
* Defines the endianness of an AFU MMIO area
@@ -71,6 +75,7 @@ typedef enum {
7175
OCXL_ALREADY_DONE = -6, /**< The action requested has already been performed */
7276
OCXL_OUT_OF_BOUNDS = -7, /**< The action requested falls outside the permitted area */
7377
OCXL_NO_MORE_CONTEXTS = -8, /**< No more contexts can be opened on the AFU */
78+
/* Adding something? Update setup.c: ocxl_err_to_string too */
7479
} ocxl_err;
7580

7681
/**
@@ -119,9 +124,9 @@ typedef struct ocxl_event {
119124

120125

121126
/* setup.c */
122-
void ocxl_want_verbose_errors(int verbose);
123-
void ocxl_want_tracing(int want_tracing);
124-
void ocxl_set_errmsg_filehandle(FILE * handle);
127+
void ocxl_enable_messages(uint64_t sources);
128+
void ocxl_set_error_message_handler(void (*handler)(ocxl_err error, const char *message));
129+
const char *ocxl_err_to_string(ocxl_err err);
125130

126131
/* afu.c */
127132
/* AFU getters */
@@ -139,6 +144,9 @@ ocxl_err ocxl_afu_open_specific(const char *name, const char *physical_function,
139144
ocxl_err ocxl_afu_open_from_dev(const char *path, ocxl_afu_h * afu);
140145
ocxl_err ocxl_afu_open(const char *name, ocxl_afu_h * afu);
141146
ocxl_err ocxl_afu_open_by_id(const char *name, uint8_t card_index, int16_t afu_index, ocxl_afu_h * afu);
147+
void ocxl_afu_enable_messages(ocxl_afu_h afu, uint64_t sources);
148+
void ocxl_afu_set_error_message_handler(ocxl_afu_h afu, void (*handler)(ocxl_afu_h afu, ocxl_err error,
149+
const char *message));
142150
ocxl_err ocxl_afu_close(ocxl_afu_h afu);
143151
ocxl_err ocxl_afu_attach(ocxl_afu_h afu);
144152

0 commit comments

Comments
 (0)