From 6f4d5cc243804c6bfff6ee416e0f316277f5d7fc Mon Sep 17 00:00:00 2001 From: Benjamin Wilhelm Date: Fri, 13 Mar 2026 12:49:22 +0100 Subject: [PATCH 1/2] AP-25563: Add diagnostic logging for intermittent Python gateway communication failures Logs gateway lifecycle events (creation, closure) with thread and identity information to help diagnose the root cause of "Cannot obtain a new communication channel" errors. Tracks gateway ownership through object hash codes for correlation. - DefaultPythonGateway.close(): Log PID and calling thread at INFO level - PythonScriptingSession: Log gateway hash and thread at creation (INFO) and shutdown (ERROR) - PythonGatewayTracker.clear(): Log process count and triggering thread at ERROR level - QueuedPythonGatewayFactory: Log eviction count and thread at gate-close (WARN) - PythonGatewayCreationGate: Include thread name in P2 phase event logs (INFO) - PythonScriptNodeModel: Handle no-cause "Cannot obtain" variant with improved error message When this error occurs again, correlating gateway hash and PID across log entries will reveal which code path triggered the unexpected shutdown. AP-25563 (Investigate "Cannot obtain a new communication channel" Python failures) --- .../scripting/nodes2/PythonScriptNodeModel.java | 13 ++++++++++++- .../scripting/nodes2/PythonScriptingSession.java | 9 +++++++++ .../org/knime/python3/DefaultPythonGateway.java | 2 ++ .../knime/python3/PythonGatewayCreationGate.java | 9 ++++++--- .../org/knime/python3/PythonGatewayTracker.java | 8 +++++--- .../knime/python3/QueuedPythonGatewayFactory.java | 12 +++++++----- 6 files changed, 41 insertions(+), 12 deletions(-) diff --git a/org.knime.python3.scripting.nodes/src/main/java/org/knime/python3/scripting/nodes2/PythonScriptNodeModel.java b/org.knime.python3.scripting.nodes/src/main/java/org/knime/python3/scripting/nodes2/PythonScriptNodeModel.java index 913586b36..ed61f9ee3 100644 --- a/org.knime.python3.scripting.nodes/src/main/java/org/knime/python3/scripting/nodes2/PythonScriptNodeModel.java +++ b/org.knime.python3.scripting.nodes/src/main/java/org/knime/python3/scripting/nodes2/PythonScriptNodeModel.java @@ -207,7 +207,7 @@ protected PortObject[] execute(final PortObject[] inObjects, final ExecutionCont /** * Throw a nicer error message if the exception we are seeing is an "Error while obtaining a new communication - * channel" + * channel" or "Cannot obtain a new communication channel" (CallbackClient already shut down) * * @param ex The exception * @throws KNIMEException A more human-readable exception @@ -220,6 +220,17 @@ private void handleNewCommunicationChannelError(final Py4JException ex) throws K "The Python process we prepared in the background got killed. Try again to start a new one."); throw KNIMEException.of(messageBuilder.build().orElseThrow(), ex); } + // Handle the case where the CallbackClient was already shut down (no cause) + if (ex.getCause() == null && ex.getMessage() != null + && ex.getMessage().startsWith("Cannot obtain a new communication channel")) { + var messageBuilder = createMessageBuilder(); + messageBuilder.withSummary("The Python communication channel was unexpectedly shut down before use. " + + "Details: " + ex.getMessage()); + messageBuilder.addResolutions("This is a known intermittent issue. Please try executing again.", + "Check the KNIME log for entries around this error for more information about " + + "what shut down the communication channel."); + throw KNIMEException.of(messageBuilder.build().orElseThrow(), ex); + } } private void runUserScript(final PythonScriptingSession session) diff --git a/org.knime.python3.scripting.nodes/src/main/java/org/knime/python3/scripting/nodes2/PythonScriptingSession.java b/org.knime.python3.scripting.nodes/src/main/java/org/knime/python3/scripting/nodes2/PythonScriptingSession.java index 33c4eb647..777bba127 100644 --- a/org.knime.python3.scripting.nodes/src/main/java/org/knime/python3/scripting/nodes2/PythonScriptingSession.java +++ b/org.knime.python3.scripting.nodes/src/main/java/org/knime/python3/scripting/nodes2/PythonScriptingSession.java @@ -159,6 +159,8 @@ final class PythonScriptingSession implements AsynchronousCloseable m_fileStoreHandlerSupplier = fileStoreHandlerSupplier; m_gateway = createGateway(pythonCommand); m_entryPoint = m_gateway.getEntryPoint(); + LOGGER.infoWithFormat("PythonScriptingSession created gateway (hash=%s) on thread '%s'", + Integer.toHexString(System.identityHashCode(m_gateway)), Thread.currentThread().getName()); m_tableConverter = new PythonArrowTableConverter(EXECUTOR_SERVICE, ARROW_STORE_FACTORY, fileStoreHandlerSupplier.getWriteFileStoreHandler()); m_outputRedirector = PythonGatewayUtils.redirectGatewayOutput(m_gateway, LOGGER::info, LOGGER::info); @@ -489,6 +491,13 @@ private void setCurrentWorkingDirectory() { Optional.ofNullable(workflowDirRef).map(r -> r.getFile().toString()) .ifPresent(m_entryPoint::setCurrentWorkingDirectory); } catch (Py4JException ex) { + // Log detailed diagnostics if the CallbackClient was already shut down + if (ex.getCause() == null && ex.getMessage() != null + && ex.getMessage().startsWith("Cannot obtain a new communication channel")) { + LOGGER.error("CallbackClient was shut down before first use. " + "Gateway hash=" + + Integer.toHexString(System.identityHashCode(m_gateway)) + ", thread='" + + Thread.currentThread().getName() + "'", ex); + } PythonProcessTerminatedException.throwIfTerminated(m_gateway, ex); throw ex; } catch (Exception ex) { // NOSONAR: We want to catch any exception here diff --git a/org.knime.python3/src/main/java/org/knime/python3/DefaultPythonGateway.java b/org.knime.python3/src/main/java/org/knime/python3/DefaultPythonGateway.java index c4fd6d206..eabc0eebe 100644 --- a/org.knime.python3/src/main/java/org/knime/python3/DefaultPythonGateway.java +++ b/org.knime.python3/src/main/java/org/knime/python3/DefaultPythonGateway.java @@ -327,6 +327,8 @@ public T getEntryPoint() { @Override public void close() throws IOException { if (m_clientServer != null) { + LOGGER.infoWithFormat("Closing PythonGateway (PID=%s) from thread '%s'", m_pid, + Thread.currentThread().getName()); m_entryPoint = null; m_clientServer.shutdown(); m_clientServer = null; diff --git a/org.knime.python3/src/main/java/org/knime/python3/PythonGatewayCreationGate.java b/org.knime.python3/src/main/java/org/knime/python3/PythonGatewayCreationGate.java index 62e507790..7b4a3541c 100644 --- a/org.knime.python3/src/main/java/org/knime/python3/PythonGatewayCreationGate.java +++ b/org.knime.python3/src/main/java/org/knime/python3/PythonGatewayCreationGate.java @@ -182,18 +182,21 @@ public void notify(final EventObject o) { if (o instanceof PhaseEvent && ((PhaseEvent)o).getPhaseId().equals(PhaseSetFactory.PHASE_INSTALL) && ((PhaseEvent)o).getType() == PhaseEvent.TYPE_START) { // lock if we enter the "install" phase - LOGGER.info("Blocking Python process startup during installation"); + LOGGER.info("Blocking Python process startup during installation (thread='" + + Thread.currentThread().getName() + "')"); INSTANCE.blockPythonCreation(); } else if (o instanceof PhaseEvent && ((PhaseEvent)o).getPhaseId().equals(PhaseSetFactory.PHASE_CONFIGURE) && ((PhaseEvent)o).getType() == PhaseEvent.TYPE_START) { // "configure" is the normal phase after install, so we can unlock Python processes again - LOGGER.info("Allowing Python process startup again after installation"); + LOGGER.info("Allowing Python process startup again after installation (thread='" + + Thread.currentThread().getName() + "')"); INSTANCE.allowPythonCreation(); } else if (o instanceof RollbackOperationEvent && !INSTANCE.isPythonGatewayCreationAllowed()) { // According to org.eclipse.equinox.internal.p2.engine.Engine.perform() -> L92, // a RollbackOperationEvent will be fired if an operation failed, and this event is only fired in that case, // so we unlock if we are currently locked. - LOGGER.info("Allowing Python process startup again after installation failed"); + LOGGER.info("Allowing Python process startup again after installation failed (thread='" + + Thread.currentThread().getName() + "')"); INSTANCE.allowPythonCreation(); } } diff --git a/org.knime.python3/src/main/java/org/knime/python3/PythonGatewayTracker.java b/org.knime.python3/src/main/java/org/knime/python3/PythonGatewayTracker.java index c9d18923d..63d5782ae 100644 --- a/org.knime.python3/src/main/java/org/knime/python3/PythonGatewayTracker.java +++ b/org.knime.python3/src/main/java/org/knime/python3/PythonGatewayTracker.java @@ -111,9 +111,11 @@ void clear() throws IOException { return; } - LOGGER.error("Found running Python processes. Aborting them to allow installation process. " - + "If this leads to failures in node execution, " - + "please restart those nodes once the installation has finished"); + LOGGER.errorWithFormat( + "Found running Python processes (%d). Aborting them to allow installation process. " + + "If this leads to failures in node execution, " + + "please restart those nodes once the installation has finished. Triggered from thread '%s'.", + m_openGateways.size(), Thread.currentThread().getName()); var exceptions = new ArrayList(); for (var gateway : m_openGateways) { diff --git a/org.knime.python3/src/main/java/org/knime/python3/QueuedPythonGatewayFactory.java b/org.knime.python3/src/main/java/org/knime/python3/QueuedPythonGatewayFactory.java index ad7ccf403..d0f6f7ecd 100644 --- a/org.knime.python3/src/main/java/org/knime/python3/QueuedPythonGatewayFactory.java +++ b/org.knime.python3/src/main/java/org/knime/python3/QueuedPythonGatewayFactory.java @@ -216,11 +216,13 @@ public void onPythonGatewayCreationGateOpen() { @Override public void onPythonGatewayCreationGateClose() { - evictGateways(// - m_gateways.values().stream()// - .flatMap(Collection::stream)// - .collect(Collectors.toList())// - ); + var gatewaysToEvict = m_gateways.values().stream()// + .flatMap(Collection::stream)// + .collect(Collectors.toList()); + LOGGER.warnWithFormat( + "PythonGatewayCreationGate closed: evicting %d queued gateways from thread '%s'", + gatewaysToEvict.size(), Thread.currentThread().getName()); + evictGateways(gatewaysToEvict); } }); } From 04b63b0a390fc33dd26369aa6501b3ee4cd7fdc8 Mon Sep 17 00:00:00 2001 From: Carsten Haubold Date: Mon, 30 Mar 2026 08:39:39 +0200 Subject: [PATCH 2/2] AP-25563: Synchronize access to m_gateway in creation gate listener Does not fix the linked ticket but is a correctness improvement. AP-25563 (Investigate "Cannot obtain a new communication channel" Python failures) --- .../python3/QueuedPythonGatewayFactory.java | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/org.knime.python3/src/main/java/org/knime/python3/QueuedPythonGatewayFactory.java b/org.knime.python3/src/main/java/org/knime/python3/QueuedPythonGatewayFactory.java index d0f6f7ecd..6b443e868 100644 --- a/org.knime.python3/src/main/java/org/knime/python3/QueuedPythonGatewayFactory.java +++ b/org.knime.python3/src/main/java/org/knime/python3/QueuedPythonGatewayFactory.java @@ -216,13 +216,15 @@ public void onPythonGatewayCreationGateOpen() { @Override public void onPythonGatewayCreationGateClose() { - var gatewaysToEvict = m_gateways.values().stream()// - .flatMap(Collection::stream)// - .collect(Collectors.toList()); - LOGGER.warnWithFormat( - "PythonGatewayCreationGate closed: evicting %d queued gateways from thread '%s'", - gatewaysToEvict.size(), Thread.currentThread().getName()); - evictGateways(gatewaysToEvict); + synchronized (PythonGatewayQueue.this) { + var gatewaysToEvict = m_gateways.values().stream()// + .flatMap(Collection::stream)// + .collect(Collectors.toList()); + LOGGER.warnWithFormat( + "PythonGatewayCreationGate closed: evicting %d queued gateways from thread '%s'", + gatewaysToEvict.size(), Thread.currentThread().getName()); + evictGateways(gatewaysToEvict); + } } }); }