Skip to content

Commit ae576b4

Browse files
committed
Merge branch 'master' into task-wrapper
2 parents 7ec5a75 + 7530a75 commit ae576b4

23 files changed

Lines changed: 463 additions & 144 deletions

batch_job/src/batch_job_info.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,5 +16,6 @@ struct batch_job_info *batch_job_info_create()
1616

1717
void batch_job_info_delete(struct batch_job_info *info)
1818
{
19+
free(info->schedd);
1920
free(info);
2021
}

batch_job/src/batch_job_info.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ struct batch_job_info {
2020
int exit_signal; /**< The signal by which the job was killed, if it exited abnormally. */
2121
int disk_allocation_exhausted; /**< Non-zero if the job filled its loop device allocation to capacity, zero otherwise */
2222
long log_pos; /**< Last read position in the log file, for ftell and fseek. (only for batch_queue_cluster) */
23+
char *schedd; /**< Condor schedd host from job log alias (e.g. from "Job submitted from host: <...&alias=...>"), if known. */
2324
};
2425

2526
/** Create a new batch_job_info struct.

batch_job/src/batch_queue_condor.c

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,19 @@ static batch_queue_id_t batch_queue_condor_wait(struct batch_queue *q, struct ba
301301

302302
if (type == 0) {
303303
info->submitted = current;
304+
/* Parse schedd from "Job submitted from host: <...&alias=condorfe.crc.nd.edu&...>" */
305+
const char *alias = strstr(line, "alias=");
306+
if (alias) {
307+
alias += 6; /* skip "alias=" */
308+
const char *end = strpbrk(alias, "&>");
309+
if (end && end > alias) {
310+
size_t len = (size_t)(end - alias);
311+
free(info->schedd);
312+
info->schedd = xxmalloc(len + 1);
313+
memcpy(info->schedd, alias, len);
314+
info->schedd[len] = '\0';
315+
}
316+
}
304317
} else if (type == 1) {
305318
info->started = current;
306319
debug(D_BATCH, "job %" PRIbjid " running now", jobid);
@@ -360,7 +373,18 @@ static batch_queue_id_t batch_queue_condor_wait(struct batch_queue *q, struct ba
360373

361374
static int batch_queue_condor_remove(struct batch_queue *q, batch_queue_id_t jobid, batch_queue_remove_mode_t mode)
362375
{
363-
char *command = string_format("condor_rm %" PRIbjid, jobid);
376+
struct batch_job_info *info = itable_lookup(q->job_table, jobid);
377+
char *schedd = NULL;
378+
if (info) {
379+
schedd = info->schedd;
380+
}
381+
382+
char *command = NULL;
383+
if (schedd) {
384+
command = string_format("condor_rm -name %s %" PRIbjid, schedd, jobid);
385+
} else {
386+
command = string_format("condor_rm %" PRIbjid, jobid);
387+
}
364388

365389
debug(D_BATCH, "%s", command);
366390
FILE *file = popen(command, "r");

batch_job/src/vine_factory.c

Lines changed: 92 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,65 @@ static int64_t factory_timeout = 0;
133133

134134
static char *factory_name = NULL;
135135

136+
/* Port range for worker-worker transfers (e.g. "10000:11000"). Passed as --transfer-port to workers. */
137+
static char *transfer_port_range = NULL;
138+
139+
/* Returns 1 if valid, 0 if invalid (prints error to stderr). */
140+
static int validate_transfer_port_range(const char *range, const char *context)
141+
{
142+
int min_port, max_port;
143+
char *r, *ptr, *end;
144+
145+
if (!range || !*range) {
146+
fprintf(stderr, "vine_factory: %stransfer-port must be non-empty (e.g. 10000 or 10000:11000)\n", context ? context : "");
147+
return 0;
148+
}
149+
150+
r = xxstrdup(range);
151+
ptr = strtok(r, ":");
152+
if (!ptr) {
153+
fprintf(stderr, "vine_factory: %sinvalid transfer-port range \"%s\" (expected PORT or PORT_MIN:PORT_MAX)\n", context ? context : "", range);
154+
free(r);
155+
return 0;
156+
}
157+
158+
errno = 0;
159+
min_port = (int)strtol(ptr, &end, 10);
160+
if (*end != '\0' || errno != 0 || min_port < 1 || min_port > 65535) {
161+
fprintf(stderr, "vine_factory: %stransfer-port min must be 1-65535, got \"%s\"\n", context ? context : "", ptr);
162+
free(r);
163+
return 0;
164+
}
165+
max_port = min_port;
166+
167+
ptr = strtok(NULL, ":");
168+
if (ptr) {
169+
errno = 0;
170+
max_port = (int)strtol(ptr, &end, 10);
171+
if (*end != '\0' || errno != 0 || max_port < 1 || max_port > 65535) {
172+
fprintf(stderr, "vine_factory: %stransfer-port max must be 1-65535, got \"%s\"\n", context ? context : "", ptr);
173+
free(r);
174+
return 0;
175+
}
176+
}
177+
178+
ptr = strtok(NULL, ":");
179+
if (ptr) {
180+
fprintf(stderr, "vine_factory: %sinvalid transfer-port range \"%s\" (expected PORT or PORT_MIN:PORT_MAX)\n", context ? context : "", range);
181+
free(r);
182+
return 0;
183+
}
184+
185+
if (min_port > max_port) {
186+
fprintf(stderr, "vine_factory: %stransfer-port min (%d) must not exceed max (%d)\n", context ? context : "", min_port, max_port);
187+
free(r);
188+
return 0;
189+
}
190+
191+
free(r);
192+
return 1;
193+
}
194+
136195
struct batch_queue *queue = 0;
137196

138197
// Whether workers should use ssl. If using the catalog server and the manager
@@ -498,11 +557,15 @@ static int submit_worker( struct batch_queue *queue )
498557
if(manual_ssl_option) ADD_ARG1(cmd," --ssl");
499558
if(single_shot) ADD_ARG1(cmd," --single-shot");
500559
if(task_wrapper) ADD_ARG2(cmd," --task-wrapper \"%s\"",task_wrapper);
560+
if(transfer_port_range) ADD_ARG1(cmd," --transfer-port %s",transfer_port_range);
501561
if(extra_worker_args) ADD_ARG2(cmd," %s",extra_worker_args);
502562

503563
char *features_string = make_features_string(features_table);
504564
ADD_ARG2(cmd," %s",features_string);
505565
free(features_string);
566+
if(transfer_port_range) {
567+
free(transfer_port_arg);
568+
}
506569

507570
if(wrapper_command) {
508571
// Note that we don't use string_wrap_command here,
@@ -886,6 +949,7 @@ int read_config_file(const char *config_file) {
886949

887950
assign_new_value(new_foremen_regex, foremen_regex, foremen-name, const char *, JX_STRING, string_value)
888951
assign_new_value(new_extra_worker_args, extra_worker_args, worker-extra-options, const char *, JX_STRING, string_value)
952+
assign_new_value(new_transfer_port_range, transfer_port_range, transfer-port, const char *, JX_STRING, string_value)
889953

890954
assign_new_value(new_condor_requirements, condor_requirements, condor-requirements, const char *, JX_STRING, string_value)
891955

@@ -924,6 +988,14 @@ int read_config_file(const char *config_file) {
924988
error_found = 1;
925989
}
926990

991+
if(new_transfer_port_range) {
992+
char ctx[PATH_MAX + 4];
993+
snprintf(ctx, sizeof(ctx), "%s: ", config_file);
994+
if (!validate_transfer_port_range(new_transfer_port_range, ctx)) {
995+
error_found = 1;
996+
}
997+
}
998+
927999
if(error_found) {
9281000
goto end;
9291001
}
@@ -961,6 +1033,11 @@ int read_config_file(const char *config_file) {
9611033
extra_worker_args = xxstrdup(new_extra_worker_args);
9621034
}
9631035

1036+
if(new_transfer_port_range != transfer_port_range) {
1037+
free(transfer_port_range);
1038+
transfer_port_range = new_transfer_port_range ? xxstrdup(new_transfer_port_range) : NULL;
1039+
}
1040+
9641041
if(new_condor_requirements != condor_requirements) {
9651042
free(condor_requirements);
9661043
condor_requirements = xxstrdup(new_condor_requirements);
@@ -1019,6 +1096,10 @@ int read_config_file(const char *config_file) {
10191096
fprintf(stdout, "worker-extra-options: %s", extra_worker_args);
10201097
}
10211098

1099+
if(transfer_port_range) {
1100+
fprintf(stdout, "transfer-port: %s\n", transfer_port_range);
1101+
}
1102+
10221103
if(workers_blocked) {
10231104
fprintf(stdout, "workers-blocked: %s", workers_blocked);
10241105
}
@@ -1073,7 +1154,7 @@ static void mainloop( struct batch_queue *queue )
10731154
{
10741155
factory_timeout_start = time(0);
10751156
} else {
1076-
// check to see if factory timeout is triggered, factory timeout will be 0 if flag isn't set
1157+
/* check to see if factory timeout is triggered, factory timeout will be 0 if flag isn't set */
10771158
if(factory_timeout > 0)
10781159
{
10791160
if(time(0) - factory_timeout_start > factory_timeout) {
@@ -1289,6 +1370,7 @@ static void show_help(const char *cmd)
12891370
printf("\nWorker environment options:\n");
12901371
printf(" %-30s Environment variable to add to worker.\n", "--env=<variable=value>");
12911372
printf(" %-30s Extra options to give to worker.\n", "-E,--extra-options=<options>");
1373+
printf(" %-30s Port range for worker-worker transfers (e.g. 10000:11000). Passed as --transfer-port.\n", "--transfer-port=<port|min:max>");
12921374
printf(" %-30s Alternate binary instead of vine_worker.\n", "--worker-binary=<file>");
12931375
printf(" %-30s Wrap worker with this command prefix.\n","--wrapper");
12941376
printf(" %-30s Wrap tasks with this command-prefix,\n","--task-wrapper");
@@ -1336,6 +1418,7 @@ enum{ LONG_OPT_CORES = 255,
13361418
LONG_OPT_DEBUG_WORKERS,
13371419
LONG_OPT_DISABLE_AFS_CHECK,
13381420
LONG_OPT_SINGLE_SHOT,
1421+
LONG_OPT_TRANSFER_PORT,
13391422
};
13401423

13411424
static const struct option long_options[] = {
@@ -1388,6 +1471,7 @@ static const struct option long_options[] = {
13881471
{"tls-sni", required_argument, 0, LONG_OPT_TLS_SNI},
13891472
{"factory-name",required_argument, 0, LONG_OPT_FACTORY_NAME},
13901473
{"single-shot", no_argument, 0, LONG_OPT_SINGLE_SHOT},
1474+
{"transfer-port", required_argument, 0, LONG_OPT_TRANSFER_PORT},
13911475
{0,0,0,0}
13921476
};
13931477

@@ -1593,6 +1677,13 @@ int main(int argc, char *argv[])
15931677
case LONG_OPT_SINGLE_SHOT:
15941678
single_shot = 1;
15951679
break;
1680+
case LONG_OPT_TRANSFER_PORT:
1681+
if (!validate_transfer_port_range(optarg, "")) {
1682+
return EXIT_FAILURE;
1683+
}
1684+
free(transfer_port_range);
1685+
transfer_port_range = xxstrdup(optarg);
1686+
break;
15961687
default:
15971688
show_help(argv[0]);
15981689
return EXIT_FAILURE;

doc/man/m4/vine_factory.m4

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ OPTION_ARG_LONG(env,variable=value)
9898
Environment variable to add to worker.
9999
OPTION_ARG(E,extra-options,options)
100100
Extra options to give to worker.
101+
OPTION_ARG_LONG(transfer-port,port) Port range for worker-worker transfers (e.g. 10000:11000).
101102
OPTION_ARG_LONG(worker-binary,file)
102103
Alternate binary instead of vine_worker.
103104
OPTION_ARG_LONG(wrapper,cmd)
@@ -180,6 +181,7 @@ workers-per-cycle
180181
task-per-worker
181182
timeout
182183
worker-extra-options
184+
transfer-port
183185
condor-requirements
184186
cores
185187
memory

doc/man/md/vine_factory.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ Worker environment options:
120120
Environment variable to add to worker.
121121
- **-E**,**--extra-options=_&lt;options&gt;_**<br />
122122
Extra options to give to worker.
123+
- **--transfer-port=_&lt;port&gt;_**<br /> Port range for worker-worker transfers (e.g. 10000:11000).
123124
- **--worker-binary=_&lt;file&gt;_**<br />
124125
Alternate binary instead of vine_worker.
125126
- **--wrapper=_&lt;cmd&gt;_**<br />
@@ -202,6 +203,7 @@ workers-per-cycle
202203
task-per-worker
203204
timeout
204205
worker-extra-options
206+
transfer-port
205207
condor-requirements
206208
cores
207209
memory

doc/manuals/catalog/index.md

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
Catalog servers function as connection points for tools that need to share
66
information and interact remotely. Various services and tools send periodic
77
updates to a catalog server to advertise their presence and vital details such
8-
as addresses, resources, and performance. Tools like `chirp_status` and
9-
`work_queue_status` query the server to displays servers that are currently
8+
as addresses, resources, and performance. Tools like `vine_status`, `work_queue_status`, and `chirp_status` and
9+
query the catalog to displays service that are currently
1010
running. Catalog updates are sent via TCP or UDP, and the catalog server exposes a
1111
JSON interface to view status and make queries.
1212

@@ -19,11 +19,13 @@ automatic backup) at Notre Dame:
1919

2020
The default view for a catalog server is a human-readable HTML summary.
2121
Machine-readable data is also available as JSON, text, XML, or ClassAds. Many
22-
parts of cctools make use of a catalog server internally. Chirp servers send
22+
parts of cctools make use of a catalog server internally. [Chirp](../chirp) servers send
2323
regular catalog updates indicating the host system's load, available disk
24-
space, cctools version, etc. Work Queue managers also advertise their projects
25-
through the catalog. When a worker starts, it can query the catalog to
26-
automatically discover a manager to contact.
24+
space, cctools version, etc.
25+
[TaskVine](../taskvine) and [Work Queue](../workqueue) managers advertise
26+
currently running applications, indicating number of tasks, workers, performance, etc.
27+
When workers start, they can query the catalog to
28+
automatically discover which manager to contact.
2729

2830
## Specifying Catalog Servers
2931

@@ -33,15 +35,14 @@ delimited list of servers to use. Each may optionally include a port number.
3335
If no port is specified, the value of the environment variable `CATALOG_PORT`
3436
is used, or the default of port 9097. If no catalog server is given on the
3537
command line, the `CATALOG_HOST` environment variable is used. If that is
36-
unset, the default of `catalog.cse.nd.edu,backup-catalog.cse.nd.edu` This
37-
could be written more verbosely as `catalog.cse.nd.edu:9097,backup-catalog.cse.nd.edu:9097` assuming the catalog port was not set in the
38-
environment.
38+
unset, the default of `catalog.cse.nd.edu,backup-catalog.cse.nd.edu` is used,
39+
assuming a default port of 9097.
3940

4041
## Querying Catalog Servers
4142

4243
There are several ways to query a catalog server. If you are querying
43-
specifically for Chirp servers or Work Queue applications, then use the
44-
`chirp_status` or `work_queue_status` tools, which query the server and
44+
specifically for Chirp servers or TaskVine applications, then use the
45+
`chirp_status` or `vine_status` tools, which query the server and
4546
display fields specific for those uses.
4647

4748
To view all kinds of records in raw JSON format, use the `catalog_query` tool.
@@ -204,5 +205,5 @@ For more information, please see [Getting Help](../help.md) or visit the [Cooper
204205

205206
## Copyright
206207

207-
CCTools is Copyright (C) 2022 The University of Notre Dame. This software is distributed under the GNU General Public License Version 2. See the file COPYING for
208+
CCTools is Copyright (C) 2026 The University of Notre Dame. This software is distributed under the GNU General Public License Version 2. See the file COPYING for
208209
details.

dttools/src/debug.c

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,14 @@ struct fatal_callback {
115115

116116
struct fatal_callback *fatal_callback_list = 0;
117117

118+
static void debug_signal_handler(int sig)
119+
{
120+
signal(sig, SIG_DFL);
121+
debug(D_ERROR, "Fatal error: signal %d received", sig);
122+
debug_backtrace;
123+
_exit(1);
124+
}
125+
118126
int debug_flags_set(const char *flagname)
119127
{
120128
struct flag_info *i;
@@ -280,6 +288,8 @@ void fatal(const char *fmt, ...)
280288
struct fatal_callback *f;
281289
va_list args;
282290

291+
debug_backtrace;
292+
283293
va_start(args, fmt);
284294
do_debug(D_FATAL, fmt, args);
285295
va_end(args);
@@ -323,6 +333,15 @@ void debug_config_file(const char *path)
323333
fprintf(stderr, "could not set debug file '%s': %s", path, strerror(errno));
324334
exit(EXIT_FAILURE);
325335
}
336+
337+
char *var = getenv("CCTOOLS_DEBUG_CAPTURE_SIGNALS");
338+
if (var) {
339+
signal(SIGSEGV, debug_signal_handler);
340+
signal(SIGILL, debug_signal_handler);
341+
signal(SIGTERM, debug_signal_handler);
342+
signal(SIGINT, debug_signal_handler);
343+
signal(SIGQUIT, debug_signal_handler);
344+
}
326345
}
327346

328347
void debug_config(const char *name)

dttools/src/debug.h

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ unless it has the flags D_NOTICE or D_FATAL. For example, a main program might
3535
#include "int_sizes.h"
3636

3737
#include <unistd.h>
38+
#include <execinfo.h>
3839

3940
#include <sys/types.h>
4041

@@ -264,4 +265,30 @@ void debug_close(void);
264265
*/
265266
#define LDEBUG(fmt, ...) debug(D_DEBUG, "%s:%s:%d[%s]: " fmt, __func__, __FILE__, __LINE__, CCTOOLS_SOURCE, __VA_ARGS__)
266267

268+
269+
#define DEBUG_BT_SIZE 100
270+
271+
/* Use offset info (+0x...) with addr2line -f -e [exe or .so] */
272+
#define debug_backtrace \
273+
{ debug(D_ERROR, "%s:%s:%d[%s]", __func__, __FILE__, __LINE__, CCTOOLS_SOURCE); \
274+
void *buffer[DEBUG_BT_SIZE]; \
275+
size_t nptrs = backtrace(buffer, DEBUG_BT_SIZE); \
276+
char **strings = backtrace_symbols(buffer, nptrs); \
277+
if (!strings) { \
278+
debug(D_ERROR, "%s", "No backtrace available."); \
279+
} else { \
280+
for (size_t bt_ct = 0; bt_ct < nptrs; bt_ct++) { \
281+
debug(D_ERROR, "%s", strings[bt_ct]); \
282+
} \
283+
} \
284+
free(strings); \
285+
} // generate a core if possible
286+
287+
288+
#define debug_assert(cond)\
289+
if (!(cond)) {\
290+
debug(D_ERROR, "Assertion failed: %s", #cond);\
291+
debug_backtrace;\
292+
abort(); }
293+
267294
#endif

0 commit comments

Comments
 (0)