depot · robstolarz · Mar 19, 2026 · Mar 19, 2026 · Mar 19, 2026 · cursor
diff --git a/pkg/cmd/ci/run.go b/pkg/cmd/ci/run.go
@@ -1,6 +1,7 @@
 package ci
 
 import (
+	"context"
 	"crypto/sha256"
 	"encoding/json"
 	"fmt"
@@ -9,6 +10,7 @@ import (
 	"path/filepath"
 	"regexp"
 	"strings"
+	"time"
 
 	"github.com/depot/cli/pkg/api"
 	"github.com/depot/cli/pkg/config"
@@ -49,7 +51,7 @@ This command is in beta and subject to change.`,
   # Run a job and connect to its terminal via SSH
   depot ci run --workflow .depot/workflows/ci.yml --job build --ssh
 
-  # Debug with tmate after a specific step
+  # Debug with SSH after a specific step (pauses workflow until you continue)
   depot ci run --workflow .depot/workflows/ci.yml --job build --ssh-after-step 3`,
 		RunE: func(cmd *cobra.Command, args []string) error {
 			if workflowPath == "" {
@@ -166,10 +168,10 @@ This command is in beta and subject to change.`,
 				}
 			}
 
-			// Insert tmate debug step if requested
+			// Insert debug pause step if requested
 			if sshAfterStep > 0 {
 				jobName := jobNames[0]
-				if err := injectTmateStep(jobs, jobName, sshAfterStep, patch != nil); err != nil {
+				if err := injectDebugStep(jobs, jobName, sshAfterStep, patch != nil); err != nil {
 					return err
 				}
 			}
@@ -184,7 +186,7 @@ This command is in beta and subject to change.`,
 				fmt.Printf("Checking out commit: %s\n", patch.mergeBase)
 			}
 			if sshAfterStep > 0 {
-				fmt.Printf("Inserting tmate step after step %d\n", sshAfterStep)
+				fmt.Printf("Inserting debug step after step %d\n", sshAfterStep)
 			}
 			fmt.Println()
 
@@ -213,12 +215,34 @@ This command is in beta and subject to change.`,
 			fmt.Printf("Run: %s\n", resp.RunId)
 			fmt.Println()
 
-			if ssh {
-				fmt.Printf("Waiting for job to start and connecting via SSH...\n")
+			if sshAfterStep > 0 || ssh {
+				if sshAfterStep > 0 {
+					fmt.Printf("Waiting for debug step to activate...\n")
+				} else {
+					fmt.Printf("Waiting for job to start...\n")
+				}
 				sandboxID, sessionID, err := waitForSandbox(ctx, tokenVal, orgID, resp.RunId, jobNames[0], "")
 				if err != nil {
 					return err
 				}
+
+				// When --ssh-after-step is used, wait for the debug step to
+				// actually be running before connecting, so the user lands in
+				// the sandbox after step N has completed.
+				if sshAfterStep > 0 {
+					fmt.Fprintf(os.Stderr, "Waiting for step %d to complete...\n", sshAfterStep)
+					if err := waitForLogMarker(ctx, tokenVal, orgID, resp.RunId, jobNames[0], "::depot-ssh-ready::"); err != nil {
+						fmt.Fprintf(os.Stderr, "Warning: could not confirm debug step is active: %v\n", err)
+						fmt.Fprintf(os.Stderr, "Connecting anyway...\n")
+					}
+				}
+
+				if sshAfterStep > 0 {
+					fmt.Fprintf(os.Stderr, "Run 'touch /tmp/depot-continue' to resume the workflow. (Your session will not end.)\n")
+				}
+				if !helpers.IsTerminal() {
+					return printSSHInfo(resp.RunId, sandboxID, sessionID, "")
+				}
 				return pty.Run(ctx, pty.SessionOptions{
 					Token:     tokenVal,
 					OrgID:     orgID,
@@ -242,7 +266,7 @@ This command is in beta and subject to change.`,
 	cmd.Flags().StringVar(&token, "token", "", "Depot API token")
 	cmd.Flags().StringVar(&workflowPath, "workflow", "", "Path to workflow YAML file")
 	cmd.Flags().StringSliceVar(&jobNames, "job", nil, "Job name(s) to run (repeatable; omit to run all)")
-	cmd.Flags().IntVar(&sshAfterStep, "ssh-after-step", 0, "1-based step index to insert a tmate debug step after (requires single --job)")
+	cmd.Flags().IntVar(&sshAfterStep, "ssh-after-step", 0, "1-based step index to pause and connect via SSH after (requires single --job)")
 	cmd.Flags().BoolVar(&ssh, "ssh", false, "Start the run and connect to the job's sandbox via interactive terminal (requires single --job)")
 
 	cmd.AddCommand(NewCmdRunList())
@@ -387,7 +411,7 @@ echo "Patch applied successfully"`, cacheKey, cacheBaseURL),
 	job["steps"] = newSteps
 }
 
-func injectTmateStep(jobs map[string]interface{}, jobName string, afterStep int, patchInjected bool) error {
+func injectDebugStep(jobs map[string]interface{}, jobName string, afterStep int, patchInjected bool) error {
 	jobRaw, ok := jobs[jobName]
 	if !ok {
 		return fmt.Errorf("job %q not found", jobName)
@@ -405,11 +429,12 @@ func injectTmateStep(jobs map[string]interface{}, jobName string, afterStep int,
 		return fmt.Errorf("job %q steps is not a list", jobName)
 	}
 
-	tmateStep := map[string]interface{}{
-		"uses": "mxschmitt/action-tmate@v3",
-		"with": map[string]interface{}{
-			"limit-access-to-actor": "false",
-		},
+	debugStep := map[string]interface{}{
+		"name": "Depot SSH Debug",
+		"run": "echo '::depot-ssh-ready::'\n" +
+			"echo 'SSH session active. Run: touch /tmp/depot-continue to resume workflow.'\n" +
+			"while [ ! -f /tmp/depot-continue ]; do sleep 5; done\n" +
+			"echo 'Continuing workflow...'",
 	}
 
 	insertAt := afterStep
@@ -439,7 +464,7 @@ func injectTmateStep(jobs map[string]interface{}, jobName string, afterStep int,
 
 	newSteps := make([]interface{}, 0, len(steps)+1)
 	newSteps = append(newSteps, steps[:insertAt]...)
-	newSteps = append(newSteps, tmateStep)
+	newSteps = append(newSteps, debugStep)
 	newSteps = append(newSteps, steps[insertAt:]...)
 	job["steps"] = newSteps
 
@@ -483,6 +508,68 @@ func formatStatus(s civ1.CIRunStatus) string {
 	}
 }
 
+// waitForLogMarker polls every pollInterval until a log line of the job's latest attempt contains marker, then returns nil.
+// It returns an error if that attempt reports a terminal status first, or ctx.Err() on cancellation; other failures are retried.
+func waitForLogMarker(ctx context.Context, token, orgID, runID, jobKey, marker string) error {
+	const pollInterval = 3 * time.Second
+
+	for {
+		// No timeout of its own: the loop ends only on a marker match, a terminal attempt status, or ctx cancellation.
+		// Resolve the latest attempt ID for the job.
+		resp, err := api.CIGetRunStatus(ctx, token, orgID, runID)
+		if err != nil {
+			// Every status error is treated as transient: pause one interval (or stop on ctx cancellation), then retry.
+			select {
+			case <-ctx.Done():
+				return ctx.Err()
+			case <-time.After(pollInterval):
+			}
+			continue
+		}
+
+		targetJob, err := findJob(resp, jobKey, "")
+		if err != nil { // any lookup error is retried after a pause
+			select {
+			case <-ctx.Done():
+				return ctx.Err()
+			case <-time.After(pollInterval):
+			}
+			continue
+		}
+
+		attempt := latestAttempt(targetJob)
+		if attempt == nil { // no attempt returned: retry after a pause
+			select {
+			case <-ctx.Done():
+				return ctx.Err()
+			case <-time.After(pollInterval):
+			}
+			continue
+		}
+
+		// Early exit if the attempt reports a terminal status: the marker is then not expected to appear.
+		switch attempt.Status {
+		case "finished", "failed", "cancelled":
+			return fmt.Errorf("job completed before debug step was reached (status: %s)", attempt.Status)
+		}
+
+		lines, err := api.CIGetJobAttemptLogs(ctx, token, orgID, attempt.AttemptId)
+		if err == nil { // a log fetch error is ignored here; the next poll tries again
+			for _, line := range lines {
+				if strings.Contains(line.Body, marker) {
+					return nil
+				}
+			}
+		}
+
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-time.After(pollInterval):
+		}
+	}
+}
+
 func NewCmdRunList() *cobra.Command {
 	var (
 		orgID    string

diff --git a/pkg/cmd/ci/ssh.go b/pkg/cmd/ci/ssh.go
@@ -73,7 +73,7 @@ This command is in beta and subject to change.`,
 			}
 
 			if info || !helpers.IsTerminal() {
-				return printSSHInfo(sandboxID, sessionID, output)
+				return printSSHInfo(runID, sandboxID, sessionID, output)
 			}
 
 			return pty.Run(ctx, pty.SessionOptions{
@@ -208,17 +208,16 @@ func findJob(resp *civ1.GetRunStatusResponse, jobKey, originalID string) (*civ1.
 		return nil, &retryableJobError{msg: fmt.Sprintf("run %s has no jobs yet", resp.RunId)}
 	}
 
-	// Match by job key (--job flag).
+	// Match by job key (--job flag): prefer exact match, then fall back to
+	// short name (after colon) for inline workflow keys like "_inline_0.yaml:e2e".
 	if jobKey != "" {
 		for _, j := range allJobs {
 			if j.JobKey == jobKey {
 				return j, nil
 			}
 		}
-		// Inline workflows get prefixed keys (e.g. "_inline_0.yaml:lint_typecheck"),
-		// so fall back to a suffix match when the user passes just the job name.
 		for _, j := range allJobs {
-			if strings.HasSuffix(j.JobKey, ":"+jobKey) {
+			if i := strings.IndexByte(j.JobKey, ':'); i >= 0 && j.JobKey[i+1:] == jobKey {
 				return j, nil
 			}
 		}
@@ -270,12 +269,13 @@ func workflowErrorMessage(resp *civ1.GetRunStatusResponse) string {
 	return ""
 }
 
-func printSSHInfo(sandboxID, sessionID, output string) error {
+func printSSHInfo(runID, sandboxID, sessionID, output string) error {
 	if output == "json" {
 		enc := json.NewEncoder(os.Stdout)
 		enc.SetIndent("", "  ")
 		return enc.Encode(map[string]string{
 			"host":        "api.depot.dev",
+			"run_id":      runID,
 			"sandbox_id":  sandboxID,
 			"session_id":  sessionID,
 			"ssh_command": fmt.Sprintf("ssh %s@api.depot.dev", sandboxID),
@@ -284,8 +284,12 @@ func printSSHInfo(sandboxID, sessionID, output string) error {
 
 	fmt.Printf("Host:     api.depot.dev\n")
 	fmt.Printf("User:     %s\n", sandboxID)
-	fmt.Printf("Password: Use your Depot API token ($DEPOT_TOKEN)\n")
+	fmt.Printf("Password: Your Depot API token ($DEPOT_TOKEN)\n")
 	fmt.Println()
-	fmt.Printf("Connect:  ssh %s@api.depot.dev\n", sandboxID)
+	fmt.Printf("Connect interactively:\n")
+	fmt.Printf("  depot ci ssh %s\n", runID)
+	fmt.Println()
+	fmt.Printf("Or via SSH directly:\n")
+	fmt.Printf("  ssh %s@api.depot.dev\n", sandboxID)
 	return nil
 }