fix: RabbitMQ monitor to more properly handle all nodes failure (#6646)
Co-authored-by: Frank Elsinga <frank@elsinga.de>
This commit is contained in:
parent
5accda390e
commit
27c0ae8f1e
@ -17,45 +17,83 @@ class RabbitMqMonitorType extends MonitorType {
|
|||||||
throw new Error("Invalid RabbitMQ Nodes");
|
throw new Error("Invalid RabbitMQ Nodes");
|
||||||
}
|
}
|
||||||
|
|
||||||
for (let baseUrl of baseUrls) {
|
if (baseUrls.length === 0) {
|
||||||
|
throw new Error("No RabbitMQ nodes configured");
|
||||||
|
}
|
||||||
|
|
||||||
|
const errors = [];
|
||||||
|
|
||||||
|
for (let i = 0; i < baseUrls.length; i++) {
|
||||||
|
const baseUrl = baseUrls[i];
|
||||||
|
const nodeIndex = i + 1;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Without a trailing slash, path in baseUrl will be removed. https://example.com/api -> https://example.com
|
await this.checkSingleNode(monitor, baseUrl, `${nodeIndex}/${baseUrls.length}`);
|
||||||
if ( !baseUrl.endsWith("/") ) {
|
// If checkSingleNode succeeds (doesn't throw), set heartbeat to UP
|
||||||
baseUrl += "/";
|
heartbeat.status = UP;
|
||||||
}
|
heartbeat.msg = baseUrls.length === 1 ? "Node is reachable and there are no alerts in the cluster" : `One of the ${baseUrls.length} nodes is reachable and there are no alerts in the cluster`;
|
||||||
const options = {
|
return;
|
||||||
// Do not start with slash, it will strip the trailing slash from baseUrl
|
|
||||||
url: new URL("api/health/checks/alarms/", baseUrl).href,
|
|
||||||
method: "get",
|
|
||||||
timeout: monitor.timeout * 1000,
|
|
||||||
headers: {
|
|
||||||
"Accept": "application/json",
|
|
||||||
"Authorization": "Basic " + Buffer.from(`${monitor.rabbitmqUsername || ""}:${monitor.rabbitmqPassword || ""}`).toString("base64"),
|
|
||||||
},
|
|
||||||
signal: axiosAbortSignal((monitor.timeout + 10) * 1000),
|
|
||||||
// Capture reason for 503 status
|
|
||||||
validateStatus: (status) => status === 200 || status === 503,
|
|
||||||
};
|
|
||||||
log.debug("monitor", `[${monitor.name}] Axios Request: ${JSON.stringify(options)}`);
|
|
||||||
const res = await axios.request(options);
|
|
||||||
log.debug("monitor", `[${monitor.name}] Axios Response: status=${res.status} body=${JSON.stringify(res.data)}`);
|
|
||||||
if (res.status === 200) {
|
|
||||||
heartbeat.status = UP;
|
|
||||||
heartbeat.msg = "OK";
|
|
||||||
break;
|
|
||||||
} else if (res.status === 503) {
|
|
||||||
throw new Error(res.data.reason);
|
|
||||||
} else {
|
|
||||||
throw new Error(`${res.status} - ${res.statusText}`);
|
|
||||||
}
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (axios.isCancel(error)) {
|
log.warn(this.name, `Node ${nodeIndex}: ${error.message}`);
|
||||||
log.debug("monitor", `[${monitor.name}] Request timed out`);
|
errors.push(`Node ${nodeIndex}: ${error.message}`);
|
||||||
throw new Error("Request timed out");
|
}
|
||||||
} else {
|
}
|
||||||
log.debug("monitor", `[${monitor.name}] Axios Error: ${JSON.stringify(error.message)}`);
|
|
||||||
throw new Error(error.message);
|
// If we reach here, all nodes failed
|
||||||
}
|
throw new Error(`All ${errors.length} nodes failed because ${errors.join("; ")}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check a single RabbitMQ node
|
||||||
|
* @param {object} monitor Monitor configuration
|
||||||
|
* @param {string} baseUrl Base URL of the RabbitMQ node
|
||||||
|
* @param {string} nodeInfo Node index info for logging (e.g., "1/3")
|
||||||
|
* @returns {Promise<void>}
|
||||||
|
* @throws {Error} If the node check fails
|
||||||
|
*/
|
||||||
|
async checkSingleNode(monitor, baseUrl, nodeInfo) {
|
||||||
|
// Without a trailing slash, path in baseUrl will be removed. https://example.com/api -> https://example.com
|
||||||
|
let normalizedUrl = baseUrl;
|
||||||
|
if (!normalizedUrl.endsWith("/")) {
|
||||||
|
normalizedUrl += "/";
|
||||||
|
}
|
||||||
|
|
||||||
|
const options = {
|
||||||
|
// Do not start with slash, it will strip the trailing slash from baseUrl
|
||||||
|
url: new URL("api/health/checks/alarms/", normalizedUrl).href,
|
||||||
|
method: "get",
|
||||||
|
timeout: monitor.timeout * 1000,
|
||||||
|
headers: {
|
||||||
|
"Accept": "application/json",
|
||||||
|
"Authorization": "Basic " + Buffer.from(`${monitor.rabbitmqUsername || ""}:${monitor.rabbitmqPassword || ""}`).toString("base64"),
|
||||||
|
},
|
||||||
|
signal: axiosAbortSignal((monitor.timeout + 10) * 1000),
|
||||||
|
// Capture reason for 503 status
|
||||||
|
validateStatus: (status) => status === 200 || status === 503,
|
||||||
|
};
|
||||||
|
|
||||||
|
log.debug("monitor", `[${monitor.name}] Checking node ${nodeInfo}: ${baseUrl}`);
|
||||||
|
|
||||||
|
try {
|
||||||
|
const res = await axios.request(options);
|
||||||
|
log.debug("monitor", `[${monitor.name}] Axios Response: status=${res.status} body=${JSON.stringify(res.data)}`);
|
||||||
|
|
||||||
|
if (res.status === 200) {
|
||||||
|
log.debug("monitor", `[${monitor.name}] Node ${nodeInfo} is healthy`);
|
||||||
|
// Success - return without error
|
||||||
|
} else if (res.status === 503) {
|
||||||
|
throw new Error(res.data.reason);
|
||||||
|
} else {
|
||||||
|
throw new Error(`${res.status} - ${res.statusText}`);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
if (axios.isCancel(error)) {
|
||||||
|
throw new Error("Request timed out");
|
||||||
|
} else if (error.response) {
|
||||||
|
// Re-throw with the original error message if it's already formatted
|
||||||
|
throw error;
|
||||||
|
} else {
|
||||||
|
throw new Error(error.message);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -17,6 +17,7 @@ describe("RabbitMQ Single Node", {
|
|||||||
rabbitmqNodes: JSON.stringify([ connectionString ]),
|
rabbitmqNodes: JSON.stringify([ connectionString ]),
|
||||||
rabbitmqUsername: "guest",
|
rabbitmqUsername: "guest",
|
||||||
rabbitmqPassword: "guest",
|
rabbitmqPassword: "guest",
|
||||||
|
timeout: 10,
|
||||||
};
|
};
|
||||||
|
|
||||||
const heartbeat = {
|
const heartbeat = {
|
||||||
@ -27,7 +28,7 @@ describe("RabbitMQ Single Node", {
|
|||||||
try {
|
try {
|
||||||
await rabbitMQMonitor.check(monitor, heartbeat, {});
|
await rabbitMQMonitor.check(monitor, heartbeat, {});
|
||||||
assert.strictEqual(heartbeat.status, UP);
|
assert.strictEqual(heartbeat.status, UP);
|
||||||
assert.strictEqual(heartbeat.msg, "OK");
|
assert.strictEqual(heartbeat.msg, "Node is reachable and there are no alerts in the cluster");
|
||||||
} finally {
|
} finally {
|
||||||
rabbitMQContainer.stop();
|
rabbitMQContainer.stop();
|
||||||
}
|
}
|
||||||
@ -39,6 +40,7 @@ describe("RabbitMQ Single Node", {
|
|||||||
rabbitmqNodes: JSON.stringify([ "http://localhost:15672" ]),
|
rabbitmqNodes: JSON.stringify([ "http://localhost:15672" ]),
|
||||||
rabbitmqUsername: "rabbitmqUser",
|
rabbitmqUsername: "rabbitmqUser",
|
||||||
rabbitmqPassword: "rabbitmqPass",
|
rabbitmqPassword: "rabbitmqPass",
|
||||||
|
timeout: 10,
|
||||||
};
|
};
|
||||||
|
|
||||||
const heartbeat = {
|
const heartbeat = {
|
||||||
@ -55,4 +57,193 @@ describe("RabbitMQ Single Node", {
|
|||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("checkSingleNode() succeeds when node is healthy", async () => {
|
||||||
|
const rabbitMQContainer = await new RabbitMQContainer().withStartupTimeout(60000).start();
|
||||||
|
const rabbitMQMonitor = new RabbitMqMonitorType();
|
||||||
|
const connectionString = `http://${rabbitMQContainer.getHost()}:${rabbitMQContainer.getMappedPort(15672)}`;
|
||||||
|
|
||||||
|
const monitor = {
|
||||||
|
name: "Test Monitor",
|
||||||
|
rabbitmqUsername: "guest",
|
||||||
|
rabbitmqPassword: "guest",
|
||||||
|
timeout: 10,
|
||||||
|
};
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Should not throw - just validates the node is healthy
|
||||||
|
await rabbitMQMonitor.checkSingleNode(monitor, connectionString, "1/1");
|
||||||
|
} finally {
|
||||||
|
rabbitMQContainer.stop();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
test("checkSingleNode() throws error when node is unreachable", async () => {
|
||||||
|
const rabbitMQMonitor = new RabbitMqMonitorType();
|
||||||
|
const monitor = {
|
||||||
|
name: "Test Monitor",
|
||||||
|
rabbitmqUsername: "guest",
|
||||||
|
rabbitmqPassword: "guest",
|
||||||
|
timeout: 10,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Should reject with any error (connection refused, timeout, etc.)
|
||||||
|
await assert.rejects(
|
||||||
|
rabbitMQMonitor.checkSingleNode(monitor, "http://localhost:15672", "1/1"),
|
||||||
|
Error
|
||||||
|
);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe("RabbitMQ Multi-Node (Mocked)", () => {
|
||||||
|
test("check() succeeds when first node is healthy", async () => {
|
||||||
|
const rabbitMQMonitor = new RabbitMqMonitorType();
|
||||||
|
const monitor = {
|
||||||
|
rabbitmqNodes: JSON.stringify([ "http://node1:15672", "http://node2:15672" ]),
|
||||||
|
rabbitmqUsername: "guest",
|
||||||
|
rabbitmqPassword: "guest",
|
||||||
|
timeout: 10,
|
||||||
|
};
|
||||||
|
|
||||||
|
const heartbeat = {
|
||||||
|
msg: "",
|
||||||
|
status: PENDING,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Mock checkSingleNode to succeed on first call (just don't throw)
|
||||||
|
let callCount = 0;
|
||||||
|
rabbitMQMonitor.checkSingleNode = async (mon, url, nodeInfo) => {
|
||||||
|
callCount++;
|
||||||
|
// Success - don't throw
|
||||||
|
};
|
||||||
|
|
||||||
|
await rabbitMQMonitor.check(monitor, heartbeat, {});
|
||||||
|
assert.strictEqual(heartbeat.status, UP);
|
||||||
|
assert.strictEqual(heartbeat.msg, "One of the 2 nodes is reachable and there are no alerts in the cluster");
|
||||||
|
assert.strictEqual(callCount, 1, "Should only check first node");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("check() succeeds when second node is healthy after first fails", async () => {
|
||||||
|
const rabbitMQMonitor = new RabbitMqMonitorType();
|
||||||
|
const monitor = {
|
||||||
|
rabbitmqNodes: JSON.stringify([ "http://node1:15672", "http://node2:15672" ]),
|
||||||
|
rabbitmqUsername: "guest",
|
||||||
|
rabbitmqPassword: "guest",
|
||||||
|
timeout: 10,
|
||||||
|
};
|
||||||
|
|
||||||
|
const heartbeat = {
|
||||||
|
msg: "",
|
||||||
|
status: PENDING,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Mock checkSingleNode to fail first, succeed second
|
||||||
|
let callCount = 0;
|
||||||
|
rabbitMQMonitor.checkSingleNode = async (mon, url, nodeInfo) => {
|
||||||
|
callCount++;
|
||||||
|
if (callCount === 1) {
|
||||||
|
throw new Error("Node 1 connection failed");
|
||||||
|
}
|
||||||
|
// Second call succeeds - don't throw
|
||||||
|
};
|
||||||
|
|
||||||
|
await rabbitMQMonitor.check(monitor, heartbeat, {});
|
||||||
|
assert.strictEqual(heartbeat.status, UP);
|
||||||
|
assert.strictEqual(heartbeat.msg, "One of the 2 nodes is reachable and there are no alerts in the cluster");
|
||||||
|
assert.strictEqual(callCount, 2, "Should check both nodes");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("check() fails with consolidated error when all nodes are down", async () => {
|
||||||
|
const rabbitMQMonitor = new RabbitMqMonitorType();
|
||||||
|
const monitor = {
|
||||||
|
rabbitmqNodes: JSON.stringify([
|
||||||
|
"http://node1:15672",
|
||||||
|
"http://node2:15672",
|
||||||
|
"http://node3:15672"
|
||||||
|
]),
|
||||||
|
rabbitmqUsername: "guest",
|
||||||
|
rabbitmqPassword: "guest",
|
||||||
|
timeout: 10,
|
||||||
|
};
|
||||||
|
|
||||||
|
const heartbeat = {
|
||||||
|
msg: "",
|
||||||
|
status: PENDING,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Mock checkSingleNode to always fail
|
||||||
|
let callCount = 0;
|
||||||
|
rabbitMQMonitor.checkSingleNode = async (mon, url, nodeInfo) => {
|
||||||
|
callCount++;
|
||||||
|
throw new Error(`Connection failed to node ${callCount}`);
|
||||||
|
};
|
||||||
|
|
||||||
|
await assert.rejects(
|
||||||
|
rabbitMQMonitor.check(monitor, heartbeat, {}),
|
||||||
|
(error) => {
|
||||||
|
assert.match(error.message, /All 3 nodes failed/);
|
||||||
|
assert.match(error.message, /Node 1:/);
|
||||||
|
assert.match(error.message, /Node 2:/);
|
||||||
|
assert.match(error.message, /Node 3:/);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
);
|
||||||
|
assert.strictEqual(callCount, 3, "Should check all three nodes");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("check() fails when no nodes are configured", async () => {
|
||||||
|
const rabbitMQMonitor = new RabbitMqMonitorType();
|
||||||
|
const monitor = {
|
||||||
|
rabbitmqNodes: JSON.stringify([]),
|
||||||
|
rabbitmqUsername: "guest",
|
||||||
|
rabbitmqPassword: "guest",
|
||||||
|
timeout: 10,
|
||||||
|
};
|
||||||
|
|
||||||
|
const heartbeat = {
|
||||||
|
msg: "",
|
||||||
|
status: PENDING,
|
||||||
|
};
|
||||||
|
|
||||||
|
await assert.rejects(
|
||||||
|
rabbitMQMonitor.check(monitor, heartbeat, {}),
|
||||||
|
/No RabbitMQ nodes configured/
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("check() tries all nodes before failing", async () => {
|
||||||
|
const rabbitMQMonitor = new RabbitMqMonitorType();
|
||||||
|
const monitor = {
|
||||||
|
rabbitmqNodes: JSON.stringify([
|
||||||
|
"http://node1:15672",
|
||||||
|
"http://node2:15672",
|
||||||
|
"http://node3:15672",
|
||||||
|
"http://node4:15672"
|
||||||
|
]),
|
||||||
|
rabbitmqUsername: "guest",
|
||||||
|
rabbitmqPassword: "guest",
|
||||||
|
timeout: 10,
|
||||||
|
};
|
||||||
|
|
||||||
|
const heartbeat = {
|
||||||
|
msg: "",
|
||||||
|
status: PENDING,
|
||||||
|
};
|
||||||
|
|
||||||
|
const checkedNodes = [];
|
||||||
|
rabbitMQMonitor.checkSingleNode = async (mon, url, nodeInfo) => {
|
||||||
|
checkedNodes.push(url);
|
||||||
|
throw new Error(`Failed: ${url}`);
|
||||||
|
};
|
||||||
|
|
||||||
|
await assert.rejects(
|
||||||
|
rabbitMQMonitor.check(monitor, heartbeat, {}),
|
||||||
|
/All 4 nodes failed/
|
||||||
|
);
|
||||||
|
|
||||||
|
assert.strictEqual(checkedNodes.length, 4, "Should check all 4 nodes");
|
||||||
|
assert.strictEqual(checkedNodes[0], "http://node1:15672");
|
||||||
|
assert.strictEqual(checkedNodes[1], "http://node2:15672");
|
||||||
|
assert.strictEqual(checkedNodes[2], "http://node3:15672");
|
||||||
|
assert.strictEqual(checkedNodes[3], "http://node4:15672");
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user