Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions infra/monitoring/alerts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,16 @@ groups:
summary: 'DB connection pool >90% utilised for 5m'
description: 'Pool utilisation is {{ $value | humanizePercentage }}. New requests may queue or fail.'

- alert: DBPoolUtilizationHigh
expr: db_pool_utilization > 0.8
for: 5m
labels:
severity: warning
service: teachlink-backend
annotations:
summary: 'DB connection pool utilization above 80% for 5m'
description: 'Pool utilization is {{ $value | humanizePercentage }}. Consider raising DATABASE_POOL_MAX or investigating slow queries.'

- alert: DBQueryLatencyHigh
expr: |
histogram_quantile(
Expand Down
60 changes: 17 additions & 43 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@
"helmet": "^8.0.0",
"ioredis": "^5.9.3",
"joi": "^18.1.2",
"lru-cache": "^11.0.0",
"jwks-rsa": "^4.0.1",
"multer": "^2.0.1",
"murmurhash-js": "^1.0.0",
Expand Down
3 changes: 3 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 23 additions & 2 deletions src/monitoring/metrics/db-pool-metrics.collector.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,30 @@ describe('DbPoolMetricsCollector', () => {
const metricsStr = await metricsService.getMetrics();

expect(metricsStr).toContain('db_pool_size 10');
expect(metricsStr).toContain('db_active_connections 6'); // totalCount(10) - idleCount(4)
expect(metricsStr).toContain('db_pool_active_connections 6'); // totalCount(10) - idleCount(4)
expect(metricsStr).toContain('db_pool_idle_connections 4');
expect(metricsStr).toContain('db_pool_pending_requests 2');
expect(metricsStr).toContain('db_pool_waiting_requests 2');
});

it('should expose the configured max connections as a gauge', async () => {
// A collection pass publishes the configured pool maximum as db_pool_max_connections.
collector.collectPoolMetrics();
const metricsStr = await metricsService.getMetrics();
// Default DATABASE_POOL_MAX is 30 per pool.config.ts
// NOTE(review): assumes the test environment does not override DATABASE_POOL_MAX — confirm.
expect(metricsStr).toMatch(/db_pool_max_connections 30/);
});

it('should expose pool utilisation as a ratio in [0, 1]', async () => {
// Utilisation is active / max, where active = totalCount - idleCount.
// With totalCount=10, idleCount=4 and default max=30: util = 6/30 = 0.2
collector.collectPoolMetrics();
let metricsStr = await metricsService.getMetrics();
expect(metricsStr).toMatch(/db_pool_utilization 0.2/);

// Saturate the pool: totalCount=30, idleCount=0, max=30 => active=30, util=1
mockPgPool.totalCount = 30;
mockPgPool.idleCount = 0;
collector.collectPoolMetrics();
metricsStr = await metricsService.getMetrics();
expect(metricsStr).toMatch(/db_pool_utilization 1/);
});

it('should wrap pgPool.connect and track wait metrics', async () => {
Expand Down
39 changes: 22 additions & 17 deletions src/monitoring/metrics/db-pool-metrics.collector.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { Injectable, Logger, OnModuleInit } from '@nestjs/common';
import { Cron, CronExpression } from '@nestjs/schedule';
import { Cron } from '@nestjs/schedule';
import { InjectDataSource } from '@nestjs/typeorm';
import { DataSource } from 'typeorm';
import { MetricsCollectionService } from './metrics-collection.service';
Expand All @@ -8,17 +8,15 @@ import { resolvePoolConfig } from '../../database/pool';
/**
* Database Pool Metrics Collector
*
* Runs on a 10-second cron schedule and pushes TypeORM / pg connection pool
* statistics into Prometheus gauges and counters defined in
* `MetricsCollectionService`.
* Polls the TypeORM / pg connection pool every 15 seconds and pushes
* statistics into Prometheus gauges defined in `MetricsCollectionService`.
*
* Exposed metrics:
* - `db_pool_size` – Total pool slots (active + idle)
* - `db_pool_active_connections` – Currently checked-out connections
* - `db_pool_idle_connections` – Idle / available connections
* - `db_pool_pending_requests` – Requests waiting for a free slot
* - `db_pool_connections_acquired_total` – Monotonically increasing acquire counter
* - `db_pool_connections_released_total` – Monotonically increasing release counter
* Exposed metrics (per spec for issue #883):
* - `db_pool_active_connections` – Currently checked-out connections
* - `db_pool_idle_connections` – Idle / available connections
* - `db_pool_waiting_requests` – Requests waiting for a free slot
* - `db_pool_max_connections` – Configured maximum pool capacity
* - `db_pool_utilization` – Ratio active/max in [0, 1] (for alerting)
*
* The underlying `pg` driver exposes pool internals via the non-standard
* `driver.pool` property on the TypeORM DataSource. We access it through a
Expand All @@ -36,7 +34,7 @@ export class DbPoolMetricsCollector implements OnModuleInit {
) {}

onModuleInit(): void {
this.logger.log('DbPoolMetricsCollector initialised – will poll pool stats every 10 s');
this.logger.log('DbPoolMetricsCollector initialised – will poll pool stats every 15 s');
// Collect an initial snapshot immediately
this.collectPoolMetrics();
this.setupPoolEventListeners();
Expand Down Expand Up @@ -123,27 +121,34 @@ export class DbPoolMetricsCollector implements OnModuleInit {
/**
* Scheduled job – polls pool statistics every 15 seconds.
*/
@Cron(CronExpression.EVERY_10_SECONDS)
@Cron('*/15 * * * * *')
collectPoolMetrics(): void {
try {
const pool = this.getPool();
if (!pool) {
return; // DataSource not yet initialised or using an unsupported driver
// Even when pool is unavailable, expose the configured max so dashboards
// do not show a stale or missing value.
this.metricsCollectionService.dbPoolMaxConnections.set(this.config.max);
return;
}

const totalCount: number = pool.totalCount ?? 0;
const idleCount: number = pool.idleCount ?? 0;
const waitingCount: number = pool.waitingCount ?? 0;
const activeCount = totalCount - idleCount;
const max = this.config.max;
const utilization = max > 0 ? activeCount / max : 0;

// Update gauges
this.metricsCollectionService.dbPoolSize.set(totalCount);
this.metricsCollectionService.activeConnections.set(activeCount);
this.metricsCollectionService.dbPoolActiveConnections.set(activeCount);
this.metricsCollectionService.dbPoolIdleConnections.set(idleCount);
this.metricsCollectionService.dbPoolPendingRequests.set(waitingCount);
this.metricsCollectionService.dbPoolWaitingRequests.set(waitingCount);
this.metricsCollectionService.dbPoolMaxConnections.set(max);
this.metricsCollectionService.dbPoolUtilization.set(utilization);

this.logger.debug(
`Pool snapshot – total=${totalCount} active=${activeCount} idle=${idleCount} waiting=${waitingCount}`,
`Pool snapshot – total=${totalCount} active=${activeCount} idle=${idleCount} waiting=${waitingCount} util=${(utilization * 100).toFixed(1)}%`,
);
} catch (err) {
this.logger.warn(
Expand Down
30 changes: 23 additions & 7 deletions src/monitoring/metrics/metrics-collection.service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,22 @@ export class MetricsCollectionService implements OnModuleInit {
// ── Infrastructure – Database ─────────────────────────────────────────────

public dbQueryDuration: Histogram;
public activeConnections: Gauge;
public dbPoolActiveConnections: Gauge;

/** Total DB pool connections acquired since startup */
public dbPoolConnectionsAcquired: Counter;
/** Total DB pool connections released since startup */
public dbPoolConnectionsReleased: Counter;
/** Current DB connection pool size (active + idle) */
public dbPoolSize: Gauge;
/** Configured maximum DB connection pool capacity */
public dbPoolMaxConnections: Gauge;
/** Current pool utilisation as a ratio in [0, 1] */
public dbPoolUtilization: Gauge;
/** Currently idle / available pool connections */
public dbPoolIdleConnections: Gauge;
/** Requests queued waiting for a free pool slot */
public dbPoolPendingRequests: Gauge;
public dbPoolWaitingRequests: Gauge;
/** Total number of DB pool connections that had to wait since startup */
public dbPoolWaitCount: Counter;
/** Duration of database connection checkout waiting in seconds */
Expand Down Expand Up @@ -249,9 +253,9 @@ export class MetricsCollectionService implements OnModuleInit {
});

// Database – connections
this.activeConnections = new Gauge({
name: 'db_active_connections',
help: 'Number of currently active database connections',
this.dbPoolActiveConnections = new Gauge({
name: 'db_pool_active_connections',
help: 'Number of currently active (checked-out) connections in the DB pool',
registers: [this.registry],
});

Expand All @@ -273,14 +277,26 @@ export class MetricsCollectionService implements OnModuleInit {
registers: [this.registry],
});

this.dbPoolMaxConnections = new Gauge({
name: 'db_pool_max_connections',
help: 'Configured maximum DB connection pool capacity',
registers: [this.registry],
});

this.dbPoolUtilization = new Gauge({
name: 'db_pool_utilization',
help: 'Current DB pool utilisation as a ratio in [0, 1] (active / max)',
registers: [this.registry],
});

this.dbPoolIdleConnections = new Gauge({
name: 'db_pool_idle_connections',
help: 'Number of idle (available) connections in the DB pool',
registers: [this.registry],
});

this.dbPoolPendingRequests = new Gauge({
name: 'db_pool_pending_requests',
this.dbPoolWaitingRequests = new Gauge({
name: 'db_pool_waiting_requests',
help: 'Number of requests waiting for a free DB pool connection',
registers: [this.registry],
});
Expand Down
Loading
Loading