/**
 * GET /api/health — reports process vitals plus the reachability of the
 * database and Redis.
 *
 * Responds 200 when both dependency pings succeed and 503 otherwise. The JSON
 * body always has the same keys: `server`, `uptime`, `timestamp`, `memory`
 * (heap used/total rounded to whole MB), `database` and `redis`
 * (each `'ok'` or `'error'`).
 */
app.get('/api/health', async (req, res) => {
  const BYTES_PER_MB = 1024 * 1024;

  const uptime = process.uptime();
  const timestamp = new Date().toISOString();
  // Take a single snapshot so `used` and `total` describe the same instant.
  const { heapUsed, heapTotal } = process.memoryUsage();

  // Runs one dependency ping and folds any failure (synchronous throw or
  // rejection) into a status string. Failures are deliberately not rethrown:
  // the endpoint must always answer, reporting the outage in its payload.
  const probe = async (ping: () => unknown): Promise<'ok' | 'error'> => {
    try {
      await ping();
      return 'ok';
    } catch {
      return 'error';
    }
  };

  // Probe both dependencies concurrently so one slow backend does not delay
  // the other check; `probe` never rejects, so `Promise.all` cannot fail fast.
  const [database, cache] = await Promise.all([
    probe(() => db.command({ ping: 1 })),
    probe(() => redis.ping()),
  ]);

  const checks = {
    server: 'ok',
    uptime,
    timestamp,
    memory: {
      used: Math.round(heapUsed / BYTES_PER_MB),
      total: Math.round(heapTotal / BYTES_PER_MB),
      unit: 'MB',
    },
    database,
    redis: cache,
  };

  const allOk = database === 'ok' && cache === 'ok';
  res.status(allOk ? 200 : 503).json(checks);
});
// middleware/metrics.ts
/** One completed HTTP request, as captured by the metrics middleware. */
interface RequestMetric {
/** HTTP method of the request (`req.method`). */
method: string;
/** Matched route path when one is available, otherwise the raw request path. */
path: string;
/** Status code of the response that was sent. */
statusCode: number;
/** Handling time as a difference of two `performance.now()` readings (milliseconds). */
duration: number;
/** Completion time taken from `Date.now()` (milliseconds since the Unix epoch). */
timestamp: number;
}
/**
 * In-memory buffer of recent request samples, oldest first. The recording
 * middleware appends to it and trims it; the metrics endpoint reads from it.
 * Contents live only in this process and are lost on restart.
 */
const metrics: RequestMetric[] = [];
/**
 * Request-timing middleware: starts a high-resolution timer when the request
 * arrives and, once the response emits `finish`, appends one sample to the
 * shared `metrics` buffer.
 */
app.use((req, res, next) => {
  const startedAt = performance.now();

  const recordSample = () => {
    const elapsed = performance.now() - startedAt;

    metrics.push({
      method: req.method,
      // Prefer the matched route's path; fall back to the raw request path.
      path: req.route?.path ?? req.path,
      statusCode: res.statusCode,
      duration: elapsed,
      timestamp: Date.now(),
    });

    // Bound memory: once the buffer grows past 10,000 samples, discard the
    // oldest 5,000 in one go rather than trimming on every request.
    if (metrics.length > 10000) {
      metrics.splice(0, 5000);
    }
  };

  res.on('finish', recordSample);
  next();
});
// Metrics API
/**
 * GET /api/admin/metrics (admin only) — summarizes the request samples
 * recorded during the last five minutes.
 *
 * Response fields:
 * - `requestCount`: number of samples in the window.
 * - `errorRate`: fraction of those samples with a status code >= 500; `0`
 *   when the window is empty (dividing by zero would yield NaN, which JSON
 *   serializes as `null`).
 * - `latency`: p50/p95/p99 of the sample durations via `percentile`.
 * - `topEndpoints`: result of `groupAndCount(recent, 'path')`.
 */
app.get('/api/admin/metrics', requireAdmin, (req, res) => {
  const WINDOW_MS = 5 * 60 * 1000; // 5min
  // Compute the cutoff once so every sample is compared to the same instant.
  const cutoff = Date.now() - WINDOW_MS;

  const recent = metrics.filter(m => m.timestamp > cutoff);
  // `percentile` expects ascending order; `map` already produced a fresh
  // array, so sorting in place does not disturb the shared buffer.
  const durations = recent.map(m => m.duration).sort((a, b) => a - b);
  const serverErrors = recent.filter(m => m.statusCode >= 500).length;

  res.json({
    requestCount: recent.length,
    errorRate: recent.length === 0 ? 0 : serverErrors / recent.length,
    latency: {
      p50: percentile(durations, 50),
      p95: percentile(durations, 95),
      p99: percentile(durations, 99),
    },
    topEndpoints: groupAndCount(recent, 'path'),
  });
});
/**
 * Nearest-rank percentile of a numeric sample.
 *
 * The function does not sort: `arr` must already be in ascending order.
 *
 * @param arr Sample values, sorted ascending.
 * @param p Percentile to look up, e.g. 50, 95, 99.
 * @returns The element at the nearest rank, or `0` when no element exists at
 *   that position (empty input, or `p` above 100).
 */
function percentile(
  arr: number[], p: number
) {
  // 1-based nearest rank: the smallest rank that covers p% of the samples.
  const rank = Math.ceil((arr.length * p) / 100);
  // Convert to a 0-based index, never dropping below the first element.
  const index = Math.max(rank - 1, 0);
  const picked = arr[index];
  return picked ?? 0;
}
/**
 * GET /api/admin/stats (admin only) — headline usage numbers.
 *
 * Runs four independent queries in parallel:
 * - total number of documents in `users`;
 * - users whose `lastActive` falls within the last 24 hours;
 * - total number of documents in `images`;
 * - the sum of `fileSize` over all `images` documents.
 *
 * Any query failure is forwarded to the error-handling middleware via
 * `next(err)`. Express versions before 5 do not forward rejections from async
 * handlers on their own, which would otherwise leave the request hanging.
 */
app.get('/api/admin/stats', requireAdmin, async (req, res, next) => {
  try {
    const DAY_MS = 86400000;
    const users = db.collection('users');
    const images = db.collection('images');
    const activeSince = new Date(Date.now() - DAY_MS);

    const [totalUsers, activeToday, totalImages, storageRows] =
      await Promise.all([
        users.countDocuments(),
        users.countDocuments({ lastActive: { $gte: activeSince } }),
        images.countDocuments(),
        images
          .aggregate([{
            $group: {
              _id: null,
              total: { $sum: '$fileSize' },
            },
          }])
          .toArray(),
      ]);

    // The $group stage yields no rows for an empty collection; report 0 bytes.
    const storageBytes = storageRows[0]?.total ?? 0;

    res.json({
      users: { total: totalUsers, activeToday },
      images: { total: totalImages },
      storage: {
        bytes: storageBytes,
        formatted: formatBytes(storageBytes),
      },
    });
  } catch (err) {
    next(err);
  }
});
// lib/alerting.ts
/** A named check that can raise an alert. */
interface AlertRule {
  /** Identifier of the rule. */
  name: string;
  /** Resolves to `true` when the alert condition currently holds. */
  condition: () => Promise<boolean>;
  /** Text of the alert raised for this rule. */
  message: string;
  /**
   * Cooldown in milliseconds.
   * NOTE(review): presumably the minimum gap between repeat alerts for this
   * rule — confirm that the scheduler enforces it.
   */
  cooldown: number; // ms
}

/** Alert rules, each evaluated against the value returned by `getMetrics()`. */
const rules: AlertRule[] = [
  {
    name: 'high-error-rate',
    // Fires when the error rate is above 0.05 (5%).
    condition: async () => {
      const { errorRate } = await getMetrics();
      return errorRate > 0.05;
    },
    message: '🔴 Error rate > 5%',
    cooldown: 15 * 60_000, // 15 minutes
  },
  {
    name: 'slow-p95',
    // Fires when the p95 latency is above 2000 (2 s per the alert message).
    condition: async () => {
      const { latency } = await getMetrics();
      return latency.p95 > 2000;
    },
    message: '🟡 P95 latency > 2s',
    cooldown: 30 * 60_000, // 30 minutes
  },
];
/**
 * Time (from `Date.now()`) at which each rule last sent an alert successfully,
 * keyed by rule name. Kept in memory only, so cooldowns reset on restart.
 */
const lastAlertSentAt = new Map<string, number>();

// Check every minute.
// Each rule is evaluated independently:
// - a rule that alerted less than `rule.cooldown` ms ago is skipped, so a
//   persisting condition does not send an alert on every tick;
// - the cooldown starts only after the Slack call succeeds, so a failed send
//   is retried on the next tick;
// - a rule that throws is logged and does not stop the remaining rules or
//   surface as an unhandled rejection.
setInterval(async () => {
  for (const rule of rules) {
    try {
      const lastSent = lastAlertSentAt.get(rule.name);
      const coolingDown =
        lastSent !== undefined && Date.now() - lastSent < rule.cooldown;
      if (coolingDown) continue;

      if (await rule.condition()) {
        await sendSlackAlert(rule.message);
        lastAlertSentAt.set(rule.name, Date.now());
      }
    } catch (err) {
      console.error(`Alert rule "${rule.name}" failed`, err);
    }
  }
}, 60_000);
git switch -c devops/PIXELCRAFT-100-monitoring
git add src/middleware/metrics.ts src/lib/alerting.ts
git commit -m "Add monitoring dashboard + alerting (PIXELCRAFT-100)"
git push origin devops/PIXELCRAFT-100-monitoring
# PR → Review → Merge → Close ticket ✅
SLI → SLO → SLA (Service Level Indicator → Objective → Agreement): the reliability hierarchy.
Google's SRE book formalized this: measure what matters (SLIs), set targets (SLOs), then make promises (SLAs). In that order. Never the reverse.