feat: add cronjob for node backups

- Create a cronjob to back up node1 data to node8
- Define schedule for daily backups at 3:00 AM
- Include error handling and notifications via Feishu
- Use SSH and rsync for secure and efficient data transfer

This commit introduces a new cronjob that automates the backup process
from node1 to node8, enabling easier management and recovery of data.
The setup includes necessary security measures and proper logging of backups,
ensuring smoother operation and notifications in case of failures.
This commit is contained in:
songtianlun 2025-04-22 14:04:42 +08:00
parent ae8bb36d48
commit ba98dbb1b8
8 changed files with 157 additions and 34 deletions

View File

@ -0,0 +1,96 @@
apiVersion: batch/v1
kind: CronJob
metadata:
  name: backup-node1-to-node8
  # The referenced Secrets (rsync-ssh-key, feishu-webhook) live in the
  # "backups" namespace — the CronJob must run there too, or the secret
  # volume mounts will fail to resolve.
  namespace: backups
spec:
  schedule: "0 3 * * *"  # daily at 03:00
  jobTemplate:
    spec:
      template:
        spec:
          containers:
            - name: rsync-backup
              # ubuntu:latest lacks rsync/ssh/curl/jq, so the script installs
              # them on every run. Consider a pre-baked image instead, e.g.:
              # image: eeacms/rsync  # may require adjusting script paths/user
              image: ubuntu:latest
              command: ["/bin/bash", "-c"]
              args:
                - |
                  # Install tooling at runtime (adds latency and an apt
                  # dependency to every backup run — see image note above).
                  apt-get update && apt-get install -y --no-install-recommends openssh-client rsync curl jq
                  echo "Starting backup script for node $MY_NODE_NAME to node8..."
                  START_TIME=$(date +%s)
                  SOURCE_DIR="/host-data"
                  DEST_NODE="node8"
                  DEST_BASE_DIR="/data/backups"
                  DEST_DIR="$DEST_BASE_DIR/$MY_NODE_NAME/data"
                  SSH_KEY_PATH="/root/.ssh/id_rsa"
                  FEISHU_WEBHOOK_URL=$(cat /etc/feishu-webhook/url)
                  echo "Ensuring SSH directory exists and setting permissions..."
                  # The key is copied out of the read-only secret mount so its
                  # permissions can be tightened to what ssh requires (0600).
                  mkdir -p /root/.ssh && chmod 700 /root/.ssh
                  cp /etc/ssh-key/id_rsa "$SSH_KEY_PATH"
                  chmod 600 "$SSH_KEY_PATH"
                  echo "Running rsync..."
                  # NOTE(review): StrictHostKeyChecking=no accepts any host key
                  # (MITM risk) — consider shipping a known_hosts file instead.
                  # --rsync-path creates the destination directory on the remote
                  # side before the transfer starts.
                  rsync -avz --delete -e "ssh -i $SSH_KEY_PATH -o StrictHostKeyChecking=no" \
                    --rsync-path="mkdir -p $DEST_DIR && rsync" \
                    "$SOURCE_DIR/" "root@$DEST_NODE:$DEST_DIR"
                  RSYNC_EXIT_CODE=$?
                  if [ $RSYNC_EXIT_CODE -eq 0 ]; then
                    echo "Rsync completed successfully."
                    END_TIME=$(date +%s)
                    DURATION=$((END_TIME - START_TIME))
                    SOURCE_SIZE=$(du -sh "$SOURCE_DIR" | cut -f1)
                    echo "Source directory size: $SOURCE_SIZE"
                    echo "Backup duration: $DURATION seconds"
                    MSG_TITLE="✅ [K3s Backup] $MY_NODE_NAME Backup Success"
                    # NOTE(review): the \n here is passed literally through
                    # jq --arg (arriving as backslash-n, not a newline) —
                    # confirm Feishu renders it as intended.
                    MSG_TEXT="Host: $MY_NODE_NAME\nSource: /data\nDestination Node: $DEST_NODE\nDestination Path: $DEST_DIR\nSource Size: $SOURCE_SIZE\nDuration: ${DURATION}s"
                    JSON_PAYLOAD=$(jq -n \
                      --arg title "$MSG_TITLE" \
                      --arg text "$MSG_TEXT" \
                      '{msg_type: "post", content: {post: {zh_cn: {title: $title, content: [[{tag: "text", text: $text}]]}}}}')
                    echo "Sending notification to Feishu..."
                    curl -X POST -H "Content-Type: application/json" -d "$JSON_PAYLOAD" "$FEISHU_WEBHOOK_URL"
                    echo "Notification sent."
                  else
                    echo "Rsync failed with exit code $RSYNC_EXIT_CODE."
                    END_TIME=$(date +%s)
                    DURATION=$((END_TIME - START_TIME))
                    MSG_TITLE="❌ [K3s Backup] $MY_NODE_NAME Backup Failed!"
                    MSG_TEXT="Host: $MY_NODE_NAME\nSource: /data\nDestination Node: $DEST_NODE\nDestination Path: $DEST_DIR\nRsync Exit Code: $RSYNC_EXIT_CODE\nDuration: ${DURATION}s"
                    JSON_PAYLOAD=$(jq -n \
                      --arg title "$MSG_TITLE" \
                      --arg text "$MSG_TEXT" \
                      '{msg_type: "post", content: {post: {zh_cn: {title: $title, content: [[{tag: "text", text: $text}]]}}}}')
                    echo "Sending failure notification to Feishu..."
                    curl -X POST -H "Content-Type: application/json" -d "$JSON_PAYLOAD" "$FEISHU_WEBHOOK_URL"
                    # Non-zero exit marks the Job attempt as failed.
                    exit 1
                  fi
              env:
                # Downward API: node the Pod was scheduled onto.
                - name: MY_NODE_NAME
                  valueFrom:
                    fieldRef:
                      fieldPath: spec.nodeName
              volumeMounts:
                # Host's /data exposed read-write inside the container.
                - name: data-volume
                  mountPath: /host-data
                # SSH private key Secret (copied to /root/.ssh by the script).
                - name: ssh-key-volume
                  mountPath: /etc/ssh-key
                  readOnly: true
                # Feishu webhook URL Secret.
                - name: feishu-webhook-volume
                  mountPath: /etc/feishu-webhook
                  readOnly: true
          volumes:
            - name: data-volume
              hostPath:
                path: /data        # the /data directory on the host
                type: Directory    # fail fast if it is not a directory
            - name: ssh-key-volume
              secret:
                secretName: rsync-ssh-key
                # Restrict mounted-file permissions; the script re-copies and
                # re-chmods anyway, but tight defaults are safer.
                defaultMode: 0400
            - name: feishu-webhook-volume
              secret:
                secretName: feishu-webhook
          # OnFailure: the kubelet restarts the container in place on failure;
          # overall retry budget is governed by the Job's backoffLimit.
          restartPolicy: OnFailure
          nodeSelector:
            # CRITICAL: pin this Pod to node1 so the hostPath mount reads the
            # intended node's /data. For node2..node7, copy this file and
            # change the hostname (and the resource name) accordingly.
            kubernetes.io/hostname: node1

View File

@ -0,0 +1,9 @@
apiVersion: v1
kind: Secret
metadata:
  name: feishu-webhook
  namespace: backups
type: Opaque
data:
  # SECURITY(review): this webhook URL is a bearer credential committed to
  # version control (base64 is not encryption, and the plaintext URL was also
  # present in a comment here). Rotate the webhook and move this value to a
  # secret store (e.g. SealedSecrets/SOPS/external-secrets) instead.
  url: aHR0cHM6Ly9vcGVuLmZlaXNodS5jbi9vcGVuLWFwaXMvYm90L3YyL2hvb2svZjJhOGQ2MzQtNmE5MC00Zjg2LWFjMmYtZWY2YTUzZGJkNjgw

View File

@ -0,0 +1,9 @@
apiVersion: v1
kind: Secret
metadata:
  # NOTE: server-populated export fields (creationTimestamp) removed — they
  # do not belong in a manifest applied from source control.
  name: rsync-ssh-key
  namespace: backups
type: Opaque
data:
  # SECURITY(review): this is a private SSH key committed to version control
  # (the same key also appears in plaintext elsewhere in this commit). Treat
  # it as compromised: generate a new keypair, update the destination node's
  # authorized_keys, and manage this Secret via a secret store.
  id_rsa: LS0tLS1CRUdJTiBPUEVOU1NIIFBSSVZBVEUgS0VZLS0tLS0KYjNCbGJuTnphQzFyWlhrdGRqRUFBQUFBQkc1dmJtVUFBQUFFYm05dVpRQUFBQUFBQUFBQkFBQUFNd0FBQUF0emMyZ3RaVwpReU5UVXhPUUFBQUNBZkhxWnJDc0Q2MWxxa2dYVjZZelUwTWRRSDR0NEIwSFBTZzl5MjVwa0tGUUFBQUtoUVEyakRVRU5vCnd3QUFBQXR6YzJndFpXUXlOVFV4T1FBQUFDQWZIcVpyQ3NENjFscWtnWFY2WXpVME1kUUg0dDRCMEhQU2c5eTI1cGtLRlEKQUFBRUR2VW9aSmxrVUpCaVJMVHVwOFVML3VDdU9ndE92djV0UFE1UEtqK3lpZm1oOGVwbXNLd1ByV1dxU0JkWHBqTlRReAoxQWZpM2dIUWM5S0QzTGJtbVFvVkFBQUFKWE52Ym1kMGFXRnViSFZ1UUhOeVpDMXpiMjVuZEdsaGJteDFiaTFoY21Ob2JHCmx1ZFhnPQotLS0tLUVORCBPUEVOU1NIIFBSSVZBVEUgS0VZLS0tLS0K

View File

@ -0,0 +1,8 @@
-----BEGIN OPENSSH PRIVATE KEY-----
b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAAAMwAAAAtzc2gtZW
QyNTUxOQAAACAfHqZrCsD61lqkgXV6YzU0MdQH4t4B0HPSg9y25pkKFQAAAKhQQ2jDUENo
wwAAAAtzc2gtZWQyNTUxOQAAACAfHqZrCsD61lqkgXV6YzU0MdQH4t4B0HPSg9y25pkKFQ
AAAEDvUoZJlkUJBiRLTup8UL/uCuOgtOvv5tPQ5PKj+yifmh8epmsKwPrWWqSBdXpjNTQx
1Afi3gHQc9KD3LbmmQoVAAAAJXNvbmd0aWFubHVuQHNyZC1zb25ndGlhbmx1bi1hcmNobG
ludXg=
-----END OPENSSH PRIVATE KEY-----

View File

@ -0,0 +1 @@
ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIB8epmsKwPrWWqSBdXpjNTQx1Afi3gHQc9KD3LbmmQoV songtianlun@k3s.rsync.backups

View File

@ -0,0 +1,33 @@
apiVersion: v1
kind: Namespace
metadata:
  name: backup-system
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: node-backup-job
  namespace: backup-system
spec:
  schedule: "0 2 * * *"  # daily at 02:00
  # Never start a new trigger run while a previous one is still active.
  concurrencyPolicy: Forbid
  jobTemplate:
    spec:
      ttlSecondsAfterFinished: 86400  # clean up finished Jobs after 1 day
      template:
        spec:
          # Needs RBAC permission to label DaemonSets in backup-system.
          serviceAccountName: backup-service-account
          nodeSelector:
            kubernetes.io/hostname: "vkvm-us1"
          containers:
            - name: backup-trigger
              image: bitnami/kubectl:latest
              command:
                - /bin/sh
                - -c
                # Sets a label on the DaemonSet object itself, waits 60s, then
                # removes it. NOTE(review): presumably the daemon pods poll for
                # this label to start a backup — labeling the DaemonSet object
                # does not restart or signal its pods by itself; confirm the
                # daemon script actually watches this label, and that 60s is
                # long enough for every node to observe it.
                - |
                  kubectl label daemonset/node-backup-daemon trigger-backup=true --overwrite -n backup-system && \
                  sleep 60 && \
                  kubectl label daemonset/node-backup-daemon trigger-backup- -n backup-system
          restartPolicy: OnFailure

View File

@ -1,36 +1,3 @@
apiVersion: v1
kind: Namespace
metadata:
name: backup-system
---
apiVersion: batch/v1
kind: CronJob
metadata:
name: node-backup-job
namespace: backup-system
spec:
# 每天凌晨2点运行
schedule: "0 2 * * *"
concurrencyPolicy: Forbid
jobTemplate:
spec:
ttlSecondsAfterFinished: 86400 # 1天后删除已完成的任务
template:
spec:
serviceAccountName: backup-service-account
nodeSelector:
kubernetes.io/hostname: "vkvm-us1"
containers:
- name: backup-trigger
image: bitnami/kubectl:latest
command:
- /bin/sh
- -c
- |
kubectl label daemonset/node-backup-daemon trigger-backup=true --overwrite -n backup-system && \
sleep 60 && \
kubectl label daemonset/node-backup-daemon trigger-backup- -n backup-system
restartPolicy: OnFailure
---
apiVersion: apps/v1
kind: DaemonSet
@ -69,7 +36,7 @@ spec:
bash /scripts/backup.sh
echo "备份完成"
fi
date
# 每分钟检查一次
sleep 60
done