# MongoDB Atlas (managed):
# Settings → Backup → Enable
#
# Continuous backup with:
# - Snapshots every 6 hours
# - Point-in-time recovery
# (oplog-based, any second)
# - Retained for 7 days
#
# Restore options:
# - Restore to same cluster
# - Restore to new cluster
# - Download snapshot
# Self-hosted MongoDB backup:
# One-off dump of the `pixelcraft` database into a date-stamped
# directory; --gzip compresses each collection's BSON file.
# NOTE(review): --uri assumes an unauthenticated local mongod —
# production connection strings need credentials; confirm.
mongodump \
--uri="mongodb://localhost:27017" \
--db=pixelcraft \
--out=/backups/$(date +%Y-%m-%d) \
--gzip
# Output:
# /backups/2026-02-19/
# pixelcraft/users.bson.gz
# pixelcraft/images.bson.gz
# pixelcraft/metadata.json
#!/bin/bash
# scripts/backup.sh — nightly full backup: MongoDB dump + S3 image
# mirror, archived and uploaded to s3://pixelcraft-backups, then
# prunes archives older than the retention window.
#
# Required env: DATABASE_URL — MongoDB connection string.
# Requires: mongodump, aws CLI (credentials configured), GNU date/tar.
set -euo pipefail

# Fail fast with a clear message instead of at first use.
: "${DATABASE_URL:?DATABASE_URL must be set}"

DATE=$(date +%Y-%m-%d_%H-%M)
BACKUP_DIR="/backups/${DATE}"
S3_BUCKET="s3://pixelcraft-backups"
RETENTION_DAYS=30

# Remove local staging files on every exit path, including failures
# part-way through (the original only cleaned up on success).
cleanup() { rm -rf -- "${BACKUP_DIR}" "${BACKUP_DIR}.tar.gz"; }
trap cleanup EXIT

echo "📦 Starting backup: ${DATE}"

# 1. Dump MongoDB (gzip-compressed BSON per collection)
mongodump \
  --uri="${DATABASE_URL}" \
  --out="${BACKUP_DIR}" \
  --gzip

# 2. Backup uploaded images (sync copies only new/changed files)
aws s3 sync \
  s3://pixelcraft-uploads \
  "${BACKUP_DIR}/uploads" \
  --only-show-errors

# 3. Archive and upload to S3. Using -C / makes the archive members
#    explicitly relative ("backups/<DATE>/..."), the same layout the
#    restore script expects, without relying on GNU tar silently
#    stripping the leading "/" from an absolute path.
tar -czf "${BACKUP_DIR}.tar.gz" -C / "backups/${DATE}"
aws s3 cp "${BACKUP_DIR}.tar.gz" "${S3_BUCKET}/${DATE}.tar.gz"

# 4. Verify upload (a missing key exits non-zero and aborts under -e)
aws s3 ls "${S3_BUCKET}/${DATE}.tar.gz"

# 5. Prune archives older than the retention window.
#    ISO YYYY-MM-DD dates compare correctly as strings; compute the
#    cutoff once instead of once per listed object. `date -d` is GNU.
CUTOFF=$(date -d "-${RETENTION_DAYS} days" +%Y-%m-%d)
aws s3 ls "${S3_BUCKET}/" | while read -r line; do
  FILE_DATE=$(echo "$line" | awk '{print $1}')
  # Skip "PRE <prefix>" rows and anything that isn't a dated object.
  [[ "${FILE_DATE}" =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]] || continue
  if [[ "${FILE_DATE}" < "${CUTOFF}" ]]; then
    FILE=$(echo "$line" | awk '{print $4}')
    aws s3 rm "${S3_BUCKET}/${FILE}"
  fi
done

echo "✅ Backup complete: ${DATE}"
# Option A: Cron (on a server)
# Nightly at 3 AM UTC
# NOTE: a crontab entry must be a single line — crontab syntax does
# not support backslash line continuations (the original two-line
# form is invalid and would be rejected / misparsed).
0 3 * * * /opt/scripts/backup.sh >> /var/log/backup.log 2>&1
# Option B: GitHub Actions
# .github/workflows/backup.yml
# Nightly scheduled run of scripts/backup.sh with a manual trigger;
# posts to Slack if any step fails.
name: Nightly Backup

on:
  schedule:
    - cron: '0 3 * * *'  # 3 AM UTC
  workflow_dispatch:      # manual trigger

jobs:
  backup:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Run backup
        env:
          DATABASE_URL: ${{ secrets.PROD_DB_URL }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET }}
        run: bash scripts/backup.sh

      - name: Notify on failure
        if: failure()
        run: |
          # Content-Type header so Slack parses the body as JSON
          curl -X POST \
            -H 'Content-Type: application/json' \
            -d '{"text":"🔴 Backup FAILED"}' \
            "${{ secrets.SLACK_WEBHOOK }}"
#!/bin/bash
# scripts/restore-test.sh — download a backup archive, restore it
# into a scratch database, sanity-check document counts, and report
# how long the restore took (a rough RTO estimate).
#
# Usage: restore-test.sh [BACKUP_NAME]
#   BACKUP_NAME is the archive name without .tar.gz, e.g.
#   2026-02-19_03-00 (backup.sh stamps archives with date AND time,
#   so a bare YYYY-MM-DD default could never match an actual key).
#   Defaults to the most recent archive in the bucket.
set -euo pipefail

TEST_DB="pixelcraft-restore-test"
S3_BUCKET="s3://pixelcraft-backups"

# Resolve the newest archive when no name is given.
if [[ $# -ge 1 ]]; then
  BACKUP_DATE=$1
else
  BACKUP_DATE=$(aws s3 ls "${S3_BUCKET}/" \
    | awk '{print $4}' | grep '\.tar\.gz$' | sort | tail -n 1)
  BACKUP_DATE=${BACKUP_DATE%.tar.gz}
fi

# Drop the scratch DB and temp files on EVERY exit path, including
# a failed verification (the original only cleaned up on success).
cleanup() {
  mongosh "${TEST_DB}" --quiet --eval "db.dropDatabase()" || true
  rm -rf /tmp/restore.tar.gz /tmp/backups
}
trap cleanup EXIT

echo "🔄 Testing restore: ${BACKUP_DATE}"
START=$(date +%s)

# 1. Download and unpack (archive members are backups/<NAME>/...)
aws s3 cp "${S3_BUCKET}/${BACKUP_DATE}.tar.gz" /tmp/restore.tar.gz
tar -xzf /tmp/restore.tar.gz -C /tmp/

# 2. Restore into an isolated test database
mongorestore \
  --uri="mongodb://localhost:27017" \
  --db="${TEST_DB}" \
  --gzip \
  "/tmp/backups/${BACKUP_DATE}/pixelcraft"

# 3. Verify data integrity. --quiet suppresses the connection
#    banner so the captured value is a plain integer; without it
#    the numeric comparison below errors out.
USERS=$(mongosh "${TEST_DB}" --quiet \
  --eval "db.users.countDocuments()")
IMAGES=$(mongosh "${TEST_DB}" --quiet \
  --eval "db.images.countDocuments()")
echo "Users restored: ${USERS}"
echo "Images restored: ${IMAGES}"

# 4. Verify minimum counts
if [ "$USERS" -lt 100 ]; then
  echo "⚠️ User count low: ${USERS}"
  exit 1
fi

END=$(date +%s)
DURATION=$((END - START))
echo "✅ Restore verified in ${DURATION}s"
echo " RTO estimate: ~${DURATION}s"
# DISASTER RECOVERY PLAN
# PixelCraft — Last updated: 2026-02
## Objectives
- RPO: ≤ 1 hour (max data loss)
- RTO: ≤ 2 hours (max downtime)
## Backup Schedule
- MongoDB Atlas: continuous + 6hr snaps
- S3 images: nightly sync
- Full backup: nightly 3AM UTC → S3
- Retention: 30 days
## Scenarios
### Database corruption / drop
1. Identify timestamp of incident
2. MongoDB Atlas → Restore →
Point-in-time → select timestamp
3. Restore to new cluster
4. Update DATABASE_URL env var
5. Verify data, switch traffic
### Server failure
1. Railway auto-restarts containers
2. If persistent: redeploy from main
3. Health check alerts in < 5 min
### S3 data loss
1. S3 versioning enabled
2. Restore previous versions
3. Or: restore from nightly backup
### Complete infrastructure failure
1. Provision new Railway project
2. Restore DB from S3 backup
3. Deploy from GitHub (main branch)
4. Update DNS records
5. Verify all services
## Contacts
- On-call: #ops-alerts Slack
- MongoDB Atlas support: [link]
- Railway status: status.railway.app
# Create a feature branch, commit the backup tooling + DR docs,
# and push for review.
git switch -c devops/PIXELCRAFT-101-backups
git add scripts/ docs/disaster-recovery.md .github/workflows/
git commit -m "Add automated backups + DR plan (PIXELCRAFT-101)"
git push origin devops/PIXELCRAFT-101-backups
# PR → Review → Merge → Close ticket ✅
RPO (Recovery Point Objective — maximum acceptable data loss) and RTO (Recovery Time Objective — maximum acceptable downtime) define your backup strategy.
Every backup decision is a tradeoff between cost, complexity, and acceptable loss. The business decides how much loss is tolerable. Engineering builds the system to match.