From f71787eac19fb9f69888317f61cadf69a9e03119 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20HONORE?= Date: Fri, 9 Jan 2026 07:30:23 +0100 Subject: [PATCH] feat: add `monitor_uptime_ratio` and `monitor_response_time_seconds` prometheus metric (#5506) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: François HONORE Co-authored-by: Frank Elsinga Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> --- server/model/monitor.js | 7 ++-- server/prometheus.js | 68 ++++++++++++++++++++++++++++++++++--- server/uptime-calculator.js | 2 +- 3 files changed, 69 insertions(+), 8 deletions(-) diff --git a/server/model/monitor.js b/server/model/monitor.js index 33256fbfd..f05ddd744 100644 --- a/server/model/monitor.js +++ b/server/model/monitor.js @@ -1044,7 +1044,10 @@ class Monitor extends BeanModel { await R.store(bean); log.debug("monitor", `[${this.name}] prometheus.update`); - this.prometheus?.update(bean, tlsInfo); + const data24h = uptimeCalculator.get24Hour(); + const data30d = uptimeCalculator.get30Day(); + const data1y = uptimeCalculator.get1Year(); + this.prometheus?.update(bean, tlsInfo, { data24h, data30d, data1y }); previousBeat = bean; @@ -1952,7 +1955,7 @@ class Monitor extends BeanModel { */ async handleTlsInfo(tlsInfo) { await this.updateTlsInfo(tlsInfo); - this.prometheus?.update(null, tlsInfo); + this.prometheus?.update(null, tlsInfo, null); if (!this.getIgnoreTls() && this.isEnabledExpiryNotification()) { log.debug("monitor", `[${this.name}] call checkCertExpiryNotifications`); diff --git a/server/prometheus.js b/server/prometheus.js index d446915d3..adc1872e0 100644 --- a/server/prometheus.js +++ b/server/prometheus.js @@ -4,6 +4,8 @@ const { R } = require("redbean-node"); let monitorCertDaysRemaining = null; let monitorCertIsValid = null; +let monitorUptimeRatio = null; +let monitorAverageResponseTimeSeconds = null; let monitorResponseTime = null; let 
monitorStatus = null; @@ -69,6 +71,18 @@ class Prometheus { labelNames: commonLabels, }); + monitorUptimeRatio = new PrometheusClient.Gauge({ + name: "monitor_uptime_ratio", + help: "Uptime ratio calculated over sliding window specified by the 'window' label. (0.0 - 1.0)", + labelNames: [...commonLabels, "window"], + }); + + monitorAverageResponseTimeSeconds = new PrometheusClient.Gauge({ + name: "monitor_response_time_seconds", + help: "Average response time in seconds calculated over sliding window specified by the 'window' label", + labelNames: [...commonLabels, "window"], + }); + monitorResponseTime = new PrometheusClient.Gauge({ name: "monitor_response_time", help: "Monitor Response Time (ms)", @@ -130,11 +144,13 @@ class Prometheus { /** * Update the metrics page + * @typedef {import("./uptime-calculator").UptimeDataResult} UptimeDataResult * @param {object} heartbeat Heartbeat details * @param {object} tlsInfo TLS details + * @param {{data24h: UptimeDataResult, data30d: UptimeDataResult, data1y:UptimeDataResult} | null} uptime the uptime and average response time over a variety of fixed windows * @returns {void} */ - update(heartbeat, tlsInfo) { + update(heartbeat, tlsInfo, uptime) { if (typeof tlsInfo !== "undefined") { try { let isValid; @@ -145,8 +161,7 @@ class Prometheus { } monitorCertIsValid.set(this.monitorLabelValues, isValid); } catch (e) { - log.error("prometheus", "Caught error"); - log.error("prometheus", e); + log.error("prometheus", "Caught error", e); } try { @@ -154,8 +169,49 @@ class Prometheus { monitorCertDaysRemaining.set(this.monitorLabelValues, tlsInfo.certInfo.daysRemaining); } } catch (e) { - log.error("prometheus", "Caught error"); - log.error("prometheus", e); + log.error("prometheus", "Caught error", e); + } + } + + if (uptime) { + try { + monitorAverageResponseTimeSeconds.set( + { ...this.monitorLabelValues, window: "1d" }, + uptime.data24h.avgPing / 1000 + ); + } catch (e) { + log.error("prometheus", "Caught error", e); + } + 
try { + monitorAverageResponseTimeSeconds.set( + { ...this.monitorLabelValues, window: "30d" }, + uptime.data30d.avgPing / 1000 + ); + } catch (e) { + log.error("prometheus", "Caught error", e); + } + try { + monitorAverageResponseTimeSeconds.set( + { ...this.monitorLabelValues, window: "365d" }, + uptime.data1y.avgPing / 1000 + ); + } catch (e) { + log.error("prometheus", "Caught error", e); + } + try { + monitorUptimeRatio.set({ ...this.monitorLabelValues, window: "1d" }, uptime.data24h.uptime); + } catch (e) { + log.error("prometheus", "Caught error", e); + } + try { + monitorUptimeRatio.set({ ...this.monitorLabelValues, window: "30d" }, uptime.data30d.uptime); + } catch (e) { + log.error("prometheus", "Caught error", e); + } + try { + monitorUptimeRatio.set({ ...this.monitorLabelValues, window: "365d" }, uptime.data1y.uptime); + } catch (e) { + log.error("prometheus", "Caught error", e); } } @@ -189,6 +245,8 @@ class Prometheus { try { monitorCertDaysRemaining.remove(this.monitorLabelValues); monitorCertIsValid.remove(this.monitorLabelValues); + monitorUptimeRatio.remove(this.monitorLabelValues); + monitorAverageResponseTimeSeconds.remove(this.monitorLabelValues); monitorResponseTime.remove(this.monitorLabelValues); monitorStatus.remove(this.monitorLabelValues); } catch (e) { diff --git a/server/uptime-calculator.js b/server/uptime-calculator.js index 1039c3b42..94d7e7733 100644 --- a/server/uptime-calculator.js +++ b/server/uptime-calculator.js @@ -206,7 +206,7 @@ class UptimeCalculator { * @param {number} status status * @param {number} ping Ping * @param {dayjs.Dayjs} date Date (Only for migration) - * @returns {dayjs.Dayjs} date + * @returns {Promise} date * @throws {Error} Invalid status */ async update(status, ping = 0, date) {