Skip to content

Commit 86cd273

Browse files
committed
Fix 310: Nodes failing to launch after startTimeout are dead
1 parent 800f64f commit 86cd273

4 files changed

Lines changed: 59 additions & 9 deletions

File tree

plugin/src/main/java/jenkins/plugins/openstack/compute/JCloudsComputer.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,9 @@ public boolean isPendingDelete() {
9898
*/
9999
/*package*/ @CheckForNull OfflineCause getFatalOfflineCause() {
100100
OfflineCause oc = getOfflineCause();
101+
102+
if (getNode().isLaunchTimedOut() && oc instanceof OfflineCause.LaunchFailed) return oc;
103+
101104
return oc instanceof DiskSpaceMonitorDescriptor.DiskSpace || oc instanceof OfflineCause.ChannelTermination
102105
? oc
103106
: null

plugin/src/main/java/jenkins/plugins/openstack/compute/JCloudsSlave.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,16 @@ public long getCreatedTime() {
365365
return created;
366366
}
367367

368+
/**
369+
* @return True if the agent should have been up by now, and it is not. Note it could have been up momentarily before.
370+
*/
371+
public boolean isLaunchTimedOut() {
372+
JCloudsComputer computer = getComputer();
373+
if (computer != null && computer.isOnline()) return false;
374+
long existsFor = System.currentTimeMillis() - created;
375+
return existsFor > getSlaveOptions().getStartTimeout();
376+
}
377+
368378
@Override public JCloudsComputer getComputer() {
369379
return (JCloudsComputer) super.getComputer();
370380
}

plugin/src/main/java/jenkins/plugins/openstack/compute/JCloudsSlaveTemplate.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ public boolean canProvision(final Label label) {
223223

224224
String cause;
225225
while ((cause = cloud.slaveIsWaitingFor(node)) != null) {
226-
if ((System.currentTimeMillis() - node.getCreatedTime()) > timeout) {
226+
if (node.isLaunchTimedOut()) {
227227

228228
String timeoutMessage = String.format("Failed to connect agent %s within timeout (%d ms): %s", node.getNodeName(), timeout, cause);
229229
Error errorQuerying = null;

plugin/src/test/java/jenkins/plugins/openstack/compute/ProvisioningTest.java

Lines changed: 45 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,7 @@
5353
import static org.hamcrest.Matchers.iterableWithSize;
5454
import static org.hamcrest.Matchers.not;
5555
import static org.hamcrest.Matchers.startsWith;
56-
import static org.junit.Assert.assertEquals;
57-
import static org.junit.Assert.assertFalse;
58-
import static org.junit.Assert.assertNotNull;
59-
import static org.junit.Assert.assertNull;
60-
import static org.junit.Assert.fail;
56+
import static org.junit.Assert.*;
6157
import static org.mockito.Matchers.any;
6258
import static org.mockito.Matchers.anyInt;
6359
import static org.mockito.Matchers.eq;
@@ -303,7 +299,7 @@ public void correctMetadataSet() throws Exception {
303299

304300
assertThat(cloud.getOpenstack().instanceUrl(), not(emptyString()));
305301
assertThat(cloud.getOpenstack().instanceFingerprint(), not(emptyString()));
306-
System.out.println(cloud.getOpenstack().instanceFingerprint());
302+
307303
Server server = template.provisionServer(null, null);
308304
Map<String, String> m = server.getMetadata();
309305
assertEquals(cloud.getOpenstack().instanceUrl(), m.get(Openstack.FINGERPRINT_KEY_URL));
@@ -340,7 +336,7 @@ public void timeoutProvisioning() throws Exception {
340336
}
341337

342338
@Test
343-
public void timeoutLaunching() throws Exception {
339+
public void timeoutLaunchingJnlp() throws Exception {
344340
final SlaveOptions opts = j.defaultSlaveOptions().getBuilder().startTimeout(1000).build();
345341
final JCloudsCloud cloud = j.configureSlaveProvisioningWithFloatingIP(j.dummyCloud(opts, j.dummySlaveTemplate("asdf")));
346342
final Iterable<NodeProvisioner.PlannedNode> pns = cloud.provision(Label.get("asdf"), 1);
@@ -359,7 +355,48 @@ public void timeoutLaunching() throws Exception {
359355
assertThat("Server details are printed", msg, containsString("Server state: Mock for "));
360356
}
361357

362-
// Wait for the server to be dereflleted
358+
// Wait for the server to be disposed
359+
AsyncResourceDisposer disposer = AsyncResourceDisposer.get();
360+
while (!disposer.getBacklog().isEmpty()) {
361+
Thread.sleep(1000);
362+
}
363+
verify(cloud.getOpenstack()).destroyServer(any(Server.class));
364+
}
365+
366+
@Test @Issue("https://github.com/jenkinsci/openstack-cloud-plugin/issues/310")
367+
public void timeoutLaunchingSsh() throws Exception {
368+
LauncherFactory.SSH sshLaunch = new LauncherFactory.SSH("no-such-creds");
369+
final SlaveOptions opts = j.defaultSlaveOptions().getBuilder().startTimeout(3000).launcherFactory(sshLaunch).build();
370+
final JCloudsCloud cloud = j.configureSlaveProvisioningWithFloatingIP(j.dummyCloud(opts, j.dummySlaveTemplate("asdf")));
371+
JCloudsSlave agent = j.provision(cloud, "asdf");
372+
JCloudsComputer computer = agent.getComputer();
373+
assertFalse(agent.isLaunchTimedOut());
374+
375+
OfflineCause ofc = null;
376+
for (int i = 0; i < 3; i++){
377+
ofc = computer.getOfflineCause();
378+
if (ofc != null) break;
379+
Thread.sleep(500);
380+
}
381+
assertThat(ofc, instanceOf(OfflineCause.LaunchFailed.class));
382+
// OfflineCause.LaunchFailed is NOT fatal until the start timeout is up
383+
assertNull(computer.getFatalOfflineCause());
384+
assertFalse(agent.isLaunchTimedOut());
385+
386+
for (int i = 0; i < 4; i++){
387+
ofc = computer.getFatalOfflineCause();
388+
if (ofc != null) break;
389+
Thread.sleep(1000);
390+
}
391+
// OfflineCause.LaunchFailed will become fatal when the start timeout is up
392+
long aliveFor = System.currentTimeMillis() - agent.getCreatedTime();
393+
assertTrue("Not timed out after ms " + aliveFor, agent.isLaunchTimedOut());
394+
//assertThat("Cause not fatal after ms " + aliveFor, computer.getFatalOfflineCause(), instanceOf(OfflineCause.LaunchFailed.class));
395+
assertThat("Cause not fatal after ms " + aliveFor, ofc, instanceOf(OfflineCause.LaunchFailed.class));
396+
397+
j.triggerOpenstackSlaveCleanup();
398+
399+
// Wait for the server to be disposed
363400
AsyncResourceDisposer disposer = AsyncResourceDisposer.get();
364401
while (!disposer.getBacklog().isEmpty()) {
365402
Thread.sleep(1000);

0 commit comments

Comments
 (0)