Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 101 additions & 14 deletions pkg/cmd/ci/run.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package ci

import (
"context"
"crypto/sha256"
"encoding/json"
"fmt"
Expand All @@ -9,6 +10,7 @@ import (
"path/filepath"
"regexp"
"strings"
"time"

"github.com/depot/cli/pkg/api"
"github.com/depot/cli/pkg/config"
Expand Down Expand Up @@ -49,7 +51,7 @@ This command is in beta and subject to change.`,
# Run a job and connect to its terminal via SSH
depot ci run --workflow .depot/workflows/ci.yml --job build --ssh

# Debug with tmate after a specific step
# Debug with SSH after a specific step (pauses workflow until you continue)
depot ci run --workflow .depot/workflows/ci.yml --job build --ssh-after-step 3`,
RunE: func(cmd *cobra.Command, args []string) error {
if workflowPath == "" {
Expand Down Expand Up @@ -166,10 +168,10 @@ This command is in beta and subject to change.`,
}
}

// Insert tmate debug step if requested
// Insert debug pause step if requested
if sshAfterStep > 0 {
jobName := jobNames[0]
if err := injectTmateStep(jobs, jobName, sshAfterStep, patch != nil); err != nil {
if err := injectDebugStep(jobs, jobName, sshAfterStep, patch != nil); err != nil {
return err
}
}
Expand All @@ -184,7 +186,7 @@ This command is in beta and subject to change.`,
fmt.Printf("Checking out commit: %s\n", patch.mergeBase)
}
if sshAfterStep > 0 {
fmt.Printf("Inserting tmate step after step %d\n", sshAfterStep)
fmt.Printf("Inserting debug step after step %d\n", sshAfterStep)
}
fmt.Println()

Expand Down Expand Up @@ -213,12 +215,34 @@ This command is in beta and subject to change.`,
fmt.Printf("Run: %s\n", resp.RunId)
fmt.Println()

if ssh {
fmt.Printf("Waiting for job to start and connecting via SSH...\n")
if sshAfterStep > 0 || ssh {
if sshAfterStep > 0 {
fmt.Printf("Waiting for debug step to activate...\n")
} else {
fmt.Printf("Waiting for job to start...\n")
}
sandboxID, sessionID, err := waitForSandbox(ctx, tokenVal, orgID, resp.RunId, jobNames[0], "")
if err != nil {
return err
}

// When --ssh-after-step is used, wait for the debug step to
// actually be running before connecting, so the user lands in
// the sandbox after step N has completed.
if sshAfterStep > 0 {
fmt.Fprintf(os.Stderr, "Waiting for step %d to complete...\n", sshAfterStep)
if err := waitForLogMarker(ctx, tokenVal, orgID, resp.RunId, jobNames[0], "::depot-ssh-ready::"); err != nil {
fmt.Fprintf(os.Stderr, "Warning: could not confirm debug step is active: %v\n", err)
fmt.Fprintf(os.Stderr, "Connecting anyway...\n")
}
}

if sshAfterStep > 0 {
fmt.Fprintf(os.Stderr, "Run 'touch /tmp/depot-continue' to resume the workflow. (Your session will not end.)\n")
}
if !helpers.IsTerminal() {
return printSSHInfo(resp.RunId, sandboxID, sessionID, "")
}
return pty.Run(ctx, pty.SessionOptions{
Token: tokenVal,
OrgID: orgID,
Expand All @@ -242,7 +266,7 @@ This command is in beta and subject to change.`,
cmd.Flags().StringVar(&token, "token", "", "Depot API token")
cmd.Flags().StringVar(&workflowPath, "workflow", "", "Path to workflow YAML file")
cmd.Flags().StringSliceVar(&jobNames, "job", nil, "Job name(s) to run (repeatable; omit to run all)")
cmd.Flags().IntVar(&sshAfterStep, "ssh-after-step", 0, "1-based step index to insert a tmate debug step after (requires single --job)")
cmd.Flags().IntVar(&sshAfterStep, "ssh-after-step", 0, "1-based step index to pause and connect via SSH after (requires single --job)")
cmd.Flags().BoolVar(&ssh, "ssh", false, "Start the run and connect to the job's sandbox via interactive terminal (requires single --job)")

cmd.AddCommand(NewCmdRunList())
Expand Down Expand Up @@ -387,7 +411,7 @@ echo "Patch applied successfully"`, cacheKey, cacheBaseURL),
job["steps"] = newSteps
}

func injectTmateStep(jobs map[string]interface{}, jobName string, afterStep int, patchInjected bool) error {
func injectDebugStep(jobs map[string]interface{}, jobName string, afterStep int, patchInjected bool) error {
jobRaw, ok := jobs[jobName]
if !ok {
return fmt.Errorf("job %q not found", jobName)
Expand All @@ -405,11 +429,12 @@ func injectTmateStep(jobs map[string]interface{}, jobName string, afterStep int,
return fmt.Errorf("job %q steps is not a list", jobName)
}

tmateStep := map[string]interface{}{
"uses": "mxschmitt/action-tmate@v3",
"with": map[string]interface{}{
"limit-access-to-actor": "false",
},
debugStep := map[string]interface{}{
"name": "Depot SSH Debug",
"run": "echo '::depot-ssh-ready::'\n" +
"echo 'SSH session active. Run: touch /tmp/depot-continue to resume workflow.'\n" +
"while [ ! -f /tmp/depot-continue ]; do sleep 5; done\n" +
"echo 'Continuing workflow...'",
}

insertAt := afterStep
Expand Down Expand Up @@ -439,7 +464,7 @@ func injectTmateStep(jobs map[string]interface{}, jobName string, afterStep int,

newSteps := make([]interface{}, 0, len(steps)+1)
newSteps = append(newSteps, steps[:insertAt]...)
newSteps = append(newSteps, tmateStep)
newSteps = append(newSteps, debugStep)
newSteps = append(newSteps, steps[insertAt:]...)
job["steps"] = newSteps

Expand Down Expand Up @@ -483,6 +508,68 @@ func formatStatus(s civ1.CIRunStatus) string {
}
}

// waitForLogMarker polls the job attempt logs until a line containing marker
// appears. This is used to detect when the injected debug step is running.
//
// It returns nil as soon as the marker is seen. It returns an error when:
//   - the latest attempt reports a terminal status ("finished", "failed",
//     "cancelled") before the marker shows up,
//   - ctx is cancelled or its own deadline passes (ctx.Err() is returned), or
//   - maxWait elapses without the marker appearing; in that case the most
//     recent polling error, if any, is wrapped into the returned error.
//
// Errors from individual polls (status lookup, job lookup, log fetch) are
// treated as transient and retried, but only until maxWait elapses, so the
// caller always regains control and can decide how to proceed.
func waitForLogMarker(ctx context.Context, token, orgID, runID, jobKey, marker string) error {
	const (
		pollInterval = 3 * time.Second
		// maxWait caps the total time spent polling so a run that never
		// reaches the debug step (or an API that keeps failing) cannot hang
		// the CLI indefinitely.
		maxWait = 5 * time.Minute
	)

	// The derived context also bounds in-flight API calls, not just the sleeps
	// between them.
	pollCtx, cancel := context.WithTimeout(ctx, maxWait)
	defer cancel()

	// lastErr is the error from the most recent poll, or nil if that poll
	// completed without error. It is surfaced only when we time out.
	var lastErr error

	for {
		var pollErr error

		// Resolve the latest attempt ID for the job.
		resp, err := api.CIGetRunStatus(pollCtx, token, orgID, runID)
		if err != nil {
			pollErr = fmt.Errorf("getting run status: %w", err)
		} else if targetJob, err := findJob(resp, jobKey, ""); err != nil {
			pollErr = err
		} else if attempt := latestAttempt(targetJob); attempt != nil {
			// Early exit if the job has already completed — the marker will never appear.
			switch attempt.Status {
			case "finished", "failed", "cancelled":
				return fmt.Errorf("job completed before debug step was reached (status: %s)", attempt.Status)
			}

			lines, err := api.CIGetJobAttemptLogs(pollCtx, token, orgID, attempt.AttemptId)
			if err != nil {
				pollErr = fmt.Errorf("getting job attempt logs: %w", err)
			} else {
				for _, line := range lines {
					if strings.Contains(line.Body, marker) {
						return nil
					}
				}
			}
		}

		// Ignore errors that were merely caused by our own deadline firing
		// mid-request; the previous error (if any) is more informative.
		if pollCtx.Err() == nil {
			lastErr = pollErr
		}

		select {
		case <-pollCtx.Done():
			// Caller cancellation (or the caller's own deadline) takes
			// precedence and is reported unchanged.
			if err := ctx.Err(); err != nil {
				return err
			}
			if lastErr != nil {
				return fmt.Errorf("timed out after %s waiting for log marker %q (last error: %w)", maxWait, marker, lastErr)
			}
			return fmt.Errorf("timed out after %s waiting for log marker %q", maxWait, marker)
		case <-time.After(pollInterval):
		}
	}
}
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing timeout causes indefinite hang in log polling

Medium Severity

waitForLogMarker has no timeout, unlike waitForSandbox which uses a 5-minute deadline. If the marker never appears and the job stays running (e.g., a preceding step hangs, the debug step isn't reached, or the API returns persistent errors that are silently retried), the CLI blocks indefinitely. The sandbox is already provisioned at this point, and the caller on line 234 treats errors as non-fatal warnings to "connect anyway" — but it never gets the chance because waitForLogMarker never returns. API errors are also swallowed and retried forever (unlike waitForSandbox which fails immediately on API errors), compounding the risk.

Fix in Cursor Fix in Web


func NewCmdRunList() *cobra.Command {
var (
orgID string
Expand Down
20 changes: 12 additions & 8 deletions pkg/cmd/ci/ssh.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ This command is in beta and subject to change.`,
}

if info || !helpers.IsTerminal() {
return printSSHInfo(sandboxID, sessionID, output)
return printSSHInfo(runID, sandboxID, sessionID, output)
}

return pty.Run(ctx, pty.SessionOptions{
Expand Down Expand Up @@ -208,17 +208,16 @@ func findJob(resp *civ1.GetRunStatusResponse, jobKey, originalID string) (*civ1.
return nil, &retryableJobError{msg: fmt.Sprintf("run %s has no jobs yet", resp.RunId)}
}

// Match by job key (--job flag).
// Match by job key (--job flag): prefer exact match, then fall back to
// short name (after colon) for inline workflow keys like "_inline_0.yaml:e2e".
if jobKey != "" {
for _, j := range allJobs {
if j.JobKey == jobKey {
return j, nil
}
}
// Inline workflows get prefixed keys (e.g. "_inline_0.yaml:lint_typecheck"),
// so fall back to a suffix match when the user passes just the job name.
for _, j := range allJobs {
if strings.HasSuffix(j.JobKey, ":"+jobKey) {
if i := strings.IndexByte(j.JobKey, ':'); i >= 0 && j.JobKey[i+1:] == jobKey {
return j, nil
}
}
Expand Down Expand Up @@ -270,12 +269,13 @@ func workflowErrorMessage(resp *civ1.GetRunStatusResponse) string {
return ""
}

func printSSHInfo(sandboxID, sessionID, output string) error {
func printSSHInfo(runID, sandboxID, sessionID, output string) error {
if output == "json" {
enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
return enc.Encode(map[string]string{
"host": "api.depot.dev",
"run_id": runID,
"sandbox_id": sandboxID,
"session_id": sessionID,
"ssh_command": fmt.Sprintf("ssh %s@api.depot.dev", sandboxID),
Expand All @@ -284,8 +284,12 @@ func printSSHInfo(sandboxID, sessionID, output string) error {

fmt.Printf("Host: api.depot.dev\n")
fmt.Printf("User: %s\n", sandboxID)
fmt.Printf("Password: Use your Depot API token ($DEPOT_TOKEN)\n")
fmt.Printf("Password: Your Depot API token ($DEPOT_TOKEN)\n")
fmt.Println()
fmt.Printf("Connect: ssh %s@api.depot.dev\n", sandboxID)
fmt.Printf("Connect interactively:\n")
fmt.Printf(" depot ci ssh %s\n", runID)
fmt.Println()
fmt.Printf("Or via SSH directly:\n")
fmt.Printf(" ssh %s@api.depot.dev\n", sandboxID)
return nil
}
Loading