Initial commit

This commit is contained in:
Digital Studium 2024-02-25 18:24:22 +03:00
commit bffccfcde7
7 changed files with 343 additions and 0 deletions

9
backup_etcd.sh Executable file
View File

@ -0,0 +1,9 @@
#!/bin/bash
#
# Takes an etcd snapshot, verifies it, prunes old snapshots and reports the
# result to VictoriaMetrics through vmagent as the `etcd_backup` measurement
# (status=1 on success, status=0.0 on failure).
set -euo pipefail

readonly BACKUP_DIR=/share/kubernetes/backups/etcd
readonly BACKUPS_TO_KEEP=7
readonly ETCDCTL=/usr/bin/etcdctl
# Placeholder URL: replace with the real vmagent write endpoint. It is kept in
# quotes so the shell never treats the angle brackets as redirections.
readonly VMAGENT_WRITE_URL='https://<vmagent-url>/write'

# Sends the backup status ($1) to vmagent.
report_status() {
  curl -k -d "etcd_backup,hostname=ml-cbt-01 status=$1" -X POST "$VMAGENT_WRITE_URL"
}

# Send an alert to VictoriaMetrics if any command below fails.
trap "echo 'sending fail backup status to vmagent...' && report_status 0.0" ERR

# Work inside the backup directory.
cd "$BACKUP_DIR"

timestamp=$(date +"%Y-%m-%d-%H-%M-%S")
snapshot="${timestamp}.db"

# Back up etcd.
ETCDCTL_API=3 "$ETCDCTL" \
  --cacert /etc/kubernetes/pki/etcd/ca.crt \
  --cert /etc/kubernetes/pki/etcd/server.crt \
  --key /etc/kubernetes/pki/etcd/server.key \
  --endpoints=https://127.0.0.1:2379 \
  snapshot save "$snapshot"

# Check that the snapshot is readable.
ETCDCTL_API=3 "$ETCDCTL" --write-out=table snapshot status "$snapshot"

# Keep only the newest snapshots and delete the rest. `rm` must not be called
# with an empty list: it would exit non-zero and a successful backup would be
# reported as failed while fewer than BACKUPS_TO_KEEP snapshots exist.
# Only *.db files are considered, so unrelated files in the directory survive.
# shellcheck disable=SC2012 # names are timestamp.db; ls -t gives the mtime order
mapfile -t stale_snapshots < <(ls -1t -- *.db | awk -v keep="$BACKUPS_TO_KEEP" 'NR > keep')
if (( ${#stale_snapshots[@]} > 0 )); then
  rm -- "${stale_snapshots[@]}"
fi

# Report the successful backup; this yields the metric etcd_backup_status = 1.
echo 'sending success backup status to vmagent...' && report_status 1

19
convert_submodule_to_folder.sh Executable file
View File

@ -0,0 +1,19 @@
#!/usr/bin/env bash
#
# Replaces a git submodule with a regular directory: the submodule repository
# is fetched, its files are moved under the submodule path on a temporary
# branch, and that branch is merged into feature/merge-submodules.
#
# Arguments:
#   $1 - path/to/module/
#   $2 - number-of-submodule (only used to name the temporary merge branch)
#   $3 - submodule merging branch
#   $4 - repo name
set -euo pipefail
trap 'printf "%s: failed at line %s: %s\n" "${0##*/}" "$LINENO" "$BASH_COMMAND" >&2' ERR

if (( $# != 4 )); then
  printf 'Usage: %s <path/to/module/> <number-of-submodule> <submodule-branch> <repo-name>\n' \
    "${0##*/}" >&2
  exit 2
fi

module_path=$1
merge_branch="merge-branch-$2"
submodule_branch=$3
repo_name=$4

# The remote may be left over from a previous run; it is fine if it is absent.
git remote rm submodule_origin 2>/dev/null || true

git rm -- "$module_path"
git commit -m "Remove ${repo_name} submodule"

# Placeholder host: replace <repo-url> with the real server. Quoted so the
# angle brackets are not parsed as shell redirections.
git remote add submodule_origin "ssh://<repo-url>/${repo_name}.git"
git fetch submodule_origin
git lfs fetch submodule_origin --all

git branch "$merge_branch" "submodule_origin/${submodule_branch}"
git checkout "$merge_branch"
git lfs fetch submodule_origin --all

# Move every top-level entry of the submodule tree under the module path so
# the merge below places the files where the submodule used to be.
mkdir -p -- "$module_path"
git ls-tree -z --name-only HEAD | xargs -0 -I {} git mv -- {} "$module_path"
git commit -m "Moved files to ${module_path}"

git checkout feature/merge-submodules
git merge --allow-unrelated-histories "$merge_branch"
git push --set-upstream origin feature/merge-submodules

69
create_systemd_service.py Executable file
View File

@ -0,0 +1,69 @@
#!/usr/bin/env python3
"""Create, enable and start a oneshot systemd service that runs a script."""
import os
import subprocess
import sys

help_text = f"""
Usage: {os.path.basename(__file__)} $1 $2 $3
required:
$1 - name of service
$2 - absolute path to script
optional:
$3 - description of service
"""
number_of_required_arguments = 2
number_of_optional_arguments = 1


def parse_arguments(argv):
    """Validate the command line and return (name, script path, description).

    `argv` has the shape of `sys.argv` (program name first). Prints the help
    text and exits with status 0 when called without arguments or with
    `help`/`-h`; exits with status 1 on a wrong argument count or when the
    script path is not an absolute path to an existing file.
    """
    arguments = argv[1:]
    if not arguments or arguments[0] in ("help", "-h"):
        print(help_text)
        sys.exit()
    if len(arguments) < number_of_required_arguments:
        print("You provided not enough arguments")
        print(help_text)
        sys.exit(1)
    if len(arguments) > number_of_required_arguments + number_of_optional_arguments:
        print("You provided extra arguments")
        print(help_text)
        sys.exit(1)

    name_of_service = arguments[0]
    path_to_script = arguments[1]
    description = arguments[2] if len(arguments) > 2 else ""  # empty if no description

    if not os.path.isabs(path_to_script):
        print("Path to script should be absolute!")
        print(help_text)
        sys.exit(1)
    if not os.path.isfile(path_to_script):
        print("Path to script should exist and must be file!")
        print(help_text)
        sys.exit(1)
    return name_of_service, path_to_script, description


def render_service_unit(path_to_script, description=""):
    """Return the text of the .service unit that runs `path_to_script` once."""
    return "\n".join(
        [
            "",
            "[Unit]",
            f"Description={description}",
            "After=network-online.target",
            "[Service]",
            "Type=oneshot",
            f"ExecStart={path_to_script}",
            "StandardOutput=journal",
            "StandardError=journal",
            "[Install]",
            "WantedBy=multi-user.target",
            "",
        ]
    )


def main(argv=None):
    """Write the unit file, make the script executable and enable the service."""
    name_of_service, path_to_script, description = parse_arguments(
        sys.argv if argv is None else argv
    )
    try:
        with open(f"/lib/systemd/system/{name_of_service}.service", "w") as f:
            f.write(render_service_unit(path_to_script, description))
        # Argument lists instead of shell strings: paths containing spaces or
        # shell metacharacters are passed through untouched.
        subprocess.run(["chmod", "+x", path_to_script], check=True)
        subprocess.run(
            ["systemctl", "enable", "--now", name_of_service], check=True
        )
    except (OSError, subprocess.CalledProcessError) as error:
        print("Something went wrong...")
        print(error, file=sys.stderr)
        sys.exit(1)
    print("Success!!!")


if __name__ == "__main__":
    main()

51
create_systemd_timer.py Executable file
View File

@ -0,0 +1,51 @@
#!/usr/bin/env python3
"""Create, enable and start a systemd timer for an existing service."""
import os
import subprocess
import sys

help_text = f"""
Usage: {os.path.basename(__file__)} $1 $2
required:
$1 - name of service
$2 - calendar (systemd OnCalendar expression)
"""
number_of_required_arguments = 2
number_of_optional_arguments = 0


def parse_arguments(argv):
    """Validate the command line and return (name of service, calendar).

    `argv` has the shape of `sys.argv` (program name first). Prints the help
    text and exits with status 0 when called without arguments or with
    `help`/`-h`; exits with status 1 on a wrong argument count.
    """
    arguments = argv[1:]
    if not arguments or arguments[0] in ("help", "-h"):
        print(help_text)
        # Exit here: without it execution continued and crashed with an
        # IndexError while reading the missing calendar argument.
        sys.exit()
    if len(arguments) < number_of_required_arguments:
        print("You provided not enough arguments")
        print(help_text)
        sys.exit(1)
    if len(arguments) > number_of_required_arguments + number_of_optional_arguments:
        print("You provided extra arguments")
        print(help_text)
        sys.exit(1)
    return arguments[0], arguments[1]


def render_timer_unit(name_of_service, calendar):
    """Return the text of the .timer unit that triggers `<name>.service`."""
    return "\n".join(
        [
            "",
            "[Unit]",
            f"Description={name_of_service} timer",
            "[Timer]",
            f"Unit={name_of_service}.service",
            f"OnCalendar={calendar}",
            "[Install]",
            "WantedBy=timers.target",
            "",
        ]
    )


def main(argv=None):
    """Write the timer unit file and enable it immediately."""
    name_of_service, calendar = parse_arguments(sys.argv if argv is None else argv)
    try:
        with open(f"/lib/systemd/system/{name_of_service}.timer", "w") as f:
            f.write(render_timer_unit(name_of_service, calendar))
        # check=True turns a failing systemctl into a reported error instead of
        # a silent exit with status 0.
        subprocess.run(
            ["systemctl", "enable", "--now", f"{name_of_service}.timer"],
            check=True,
        )
    except (OSError, subprocess.CalledProcessError) as error:
        print("Something went wrong...")
        print(error, file=sys.stderr)
        sys.exit(1)
    print("Success!!!")


if __name__ == "__main__":
    main()

7
ds_up_down.sh Executable file
View File

@ -0,0 +1,7 @@
#!/bin/bash
#
# Switches a DaemonSet off or back on by patching its pod template.
#
# Usage: ds_up_down.sh <up|down> <namespace> <daemonset>
#   down - adds the nodeSelector "non-existing": "true" to the pod template
#          (presumably no node carries that label, so no pods get scheduled)
#   up   - removes that nodeSelector entry again

usage() {
  printf 'Usage: %s <up|down> <namespace> <daemonset>\n' "${0##*/}" >&2
  exit 2
}

# All three arguments are required; an empty action used to break the `[` test.
if (( $# < 3 )); then
  usage
fi

action=$1
namespace=$2
daemonset=$3

case "$action" in
  down)
    KUBECONFIG=/etc/kubernetes/admin.conf kubectl -n "$namespace" patch daemonset "$daemonset" \
      -p '{"spec": {"template": {"spec": {"nodeSelector": {"non-existing": "true"}}}}}'
    ;;
  up)
    KUBECONFIG=/etc/kubernetes/admin.conf kubectl -n "$namespace" patch daemonset "$daemonset" \
      --type json -p='[{"op": "remove", "path": "/spec/template/spec/nodeSelector/non-existing"}]'
    ;;
  *)
    # Unknown actions used to be ignored silently.
    usage
    ;;
esac

144
kube_ha.sh Executable file
View File

@ -0,0 +1,144 @@
#!/usr/bin/env bash
# bash is required: the cilium download step further down in this file relies
# on brace expansion, which plain sh does not provide.

# Prerequisites and Docker's official GPG key:
sudo apt-get update
# -y keeps the installation non-interactive, like the later install steps.
sudo apt-get install -y ca-certificates curl nfs-common # nfs-common needed for nfs-client storage class
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc

# Add the Docker repository to Apt sources:
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" \
  | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update

# install docker/containerd
sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin

# install kubelet kubeadm kubectl
curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.29/deb/Release.key \
  | sudo gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg
echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.29/deb/ /' \
  | sudo tee /etc/apt/sources.list.d/kubernetes.list
sudo apt-get update
sudo apt-get install -y kubelet kubeadm kubectl
# Pin the versions so a routine upgrade does not move the cluster components;
# needs root like the other package operations.
sudo apt-mark hold kubelet kubeadm kubectl
# install nvidia-container-toolkit and make it default runtime
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
  | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
  && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
  | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
  | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt update && sudo apt install nvidia-container-toolkit -y

# Replace the containerd configuration so the CRI plugin uses the "nvidia"
# runtime (nvidia-container-runtime) by default.
# The file is written through `sudo tee` because a plain `>` redirection is
# performed by the calling, unprivileged shell. The quoted delimiter keeps the
# content literal.
# NOTE(review): containerd's documentation spells the cgroup option
# `SystemdCgroup`; confirm the lowercase `systemdCgroup` below is honoured.
sudo tee /etc/containerd/config.toml > /dev/null << 'EOF'
version = 2
[plugins]
[plugins."io.containerd.grpc.v1.cri"]
[plugins."io.containerd.grpc.v1.cri".containerd]
default_runtime_name = "nvidia"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
privileged_without_host_devices = false
runtime_engine = ""
runtime_root = ""
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
BinaryName = "/usr/bin/nvidia-container-runtime"
systemdCgroup = true
EOF
sudo service containerd restart

# Register the NVIDIA runtime with Docker as well.
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
# install nginx for load balancing
sudo apt install nginx-light libnginx-mod-stream -y

# nginx configuration:
#   stream - TCP proxy on 127.0.0.1:8080 that spreads connections over the
#            three kube-apiserver instances (least_conn)
#   http   - port 8081 with /healthz (always 200) and /stub_status
# Written through `sudo tee` because a plain `>` redirection is performed by
# the calling, unprivileged shell. The quoted delimiter keeps the content
# literal.
sudo tee /etc/nginx/nginx.conf > /dev/null << 'EOF'
error_log stderr notice;
load_module /lib/nginx/modules/ngx_stream_module.so;
worker_processes auto;
worker_rlimit_nofile 130048;
worker_shutdown_timeout 10s;
events {
multi_accept on;
use epoll;
worker_connections 16384;
}
stream {
upstream kube_apiserver {
least_conn;
server 127.0.0.1:6443;
server 10.239.10.222:6443;
server 10.239.10.223:6443;
}
server {
listen 127.0.0.1:8080;
proxy_pass kube_apiserver;
proxy_timeout 10m;
proxy_connect_timeout 1s;
}
}
http {
aio threads;
aio_write on;
tcp_nopush on;
tcp_nodelay on;
keepalive_timeout 5m;
keepalive_requests 100;
reset_timedout_connection on;
server_tokens off;
autoindex off;
server {
listen 8081;
location /healthz {
access_log off;
return 200;
}
location /stub_status {
stub_status on;
access_log off;
}
}
}
EOF
sudo service nginx restart
# Cluster bootstrap through the local nginx load balancer (127.0.0.1:8080).
# NOTE(review): these three commands presumably run on different nodes (init
# on the first control-plane node, the joins on the remaining ones); they are
# not meant to be executed one after another on a single host - confirm.
# NOTE(review): the token, CA certificate hash and certificate key are
# hard-coded; they look like output copied from one `kubeadm init` run and
# likely have to be replaced with current values - confirm before use.
kubeadm init --upload-certs --control-plane-endpoint=127.0.0.1:8080
kubeadm join 127.0.0.1:8080 --token e01u52.c9uq77rkvl3qm86u --discovery-token-ca-cert-hash sha256:a7fc076bdcd7391e8bc7577b54ecc492d319298b5699293f8390042b57866700 --control-plane --certificate-key 2931030dff3041c185715298ab833895c6f36028a97b2f139857641bbe7f66b5 # master
kubeadm join 127.0.0.1:8080 --token e01u52.c9uq77rkvl3qm86u --discovery-token-ca-cert-hash sha256:a7fc076bdcd7391e8bc7577b54ecc492d319298b5699293f8390042b57866700 # worker
# install cilium
# --fail makes curl return nothing (instead of an HTML error page) when the
# version file cannot be fetched.
CILIUM_CLI_VERSION=$(curl -fsS https://raw.githubusercontent.com/cilium/cilium-cli/main/stable.txt)
CLI_ARCH=amd64
if [ "$(uname -m)" = "aarch64" ]; then CLI_ARCH=arm64; fi
cilium_tarball="cilium-linux-${CLI_ARCH}.tar.gz"
# Brace expansion downloads both the tarball and its .sha256sum file.
curl -L --fail --remote-name-all \
  "https://github.com/cilium/cilium-cli/releases/download/${CILIUM_CLI_VERSION}/${cilium_tarball}"{,.sha256sum}
# Only unpack into /usr/local/bin when the checksum matches; previously the
# verification result was ignored and the archive was extracted regardless.
if sha256sum --check "${cilium_tarball}.sha256sum"; then
  sudo tar xzvfC "${cilium_tarball}" /usr/local/bin
  rm "${cilium_tarball}"{,.sha256sum}
  cilium install --version 1.15.1
else
  echo "cilium-cli checksum verification failed; cilium was not installed" >&2
fi
# Remove the control-plane NoSchedule taint from the masters (the trailing "-"
# deletes a taint). The comment that used to be glued to this command was not
# a shell comment and was passed to kubectl as part of the taint argument.
kubectl taint nodes ml-cbt-02 ml-cbt-03 node-role.kubernetes.io/control-plane:NoSchedule-
# install tools
# NOTE(review): this pipes a remote script straight into bash; consider
# downloading and inspecting it first.
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash # helm
helm plugin install https://github.com/databus23/helm-diff # helm-diff needed for helmfile

44
prepare.sh Executable file
View File

@ -0,0 +1,44 @@
#!/usr/bin/env bash
# Install the SSSD/LDAP client packages and activate SSSD.
nala install -y sssd-ldap sssd-tools ldap-utils
# -p: do not fail when the directory already exists (e.g. on a re-run).
mkdir -p /etc/ldap/ca/
# Manual steps, intentionally left commented out - these files have to be
# created or edited by hand before continuing:
#vim /etc/ldap/ca/ninv.crt
#vim /etc/ldap/ldap.conf
#vim /etc/hosts
#vim /etc/sssd/sssd.conf
chmod 600 /etc/sssd/sssd.conf
# Enable the mkhomedir PAM profile.
pam-auth-update --enable mkhomedir
service sssd restart
# Install kubelet, kubeadm and kubectl from the pkgs.k8s.io v1.29 repository.
nala install -y apt-transport-https ca-certificates curl gpg
# The keyring directory is not guaranteed to exist; create it before gpg
# writes into it (kube_ha.sh does the same).
sudo install -m 0755 -d /etc/apt/keyrings
curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.29/deb/Release.key \
  | sudo gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg
echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.29/deb/ /' \
  | sudo tee /etc/apt/sources.list.d/kubernetes.list
nala update
nala install -y kubelet kubeadm kubectl
# Pin the versions so routine upgrades do not move the cluster components.
apt-mark hold kubelet kubeadm kubectl
# Container runtime.
nala install -y containerd

# Turn swap off for the running system; the fstab edit is a manual step
# (presumably to keep swap disabled after a reboot - confirm).
swapoff -a
#vim /etc/fstab

# Kernel modules: list them for loading at boot, then load them right away.
printf '%s\n' overlay br_netfilter | sudo tee /etc/modules-load.d/k8s.conf
for kernel_module in overlay br_netfilter; do
  modprobe "$kernel_module"
done

# sysctl params required by setup, params persist across reboots
printf '%s\n' \
  'net.bridge.bridge-nf-call-iptables = 1' \
  'net.bridge.bridge-nf-call-ip6tables = 1' \
  'net.ipv4.ip_forward = 1' \
  | sudo tee /etc/sysctl.d/k8s.conf

# Apply sysctl params without reboot
sudo sysctl --system
# Join this node to the cluster whose API server is at 10.239.10.221:6443.
# NOTE(review): the bootstrap token and CA certificate hash are hard-coded;
# they presumably come from an earlier `kubeadm init`/`kubeadm token create`
# run and may need to be replaced with current values - confirm before use.
kubeadm join 10.239.10.221:6443 --token z51k9o.144c6ntyob9ut43y --discovery-token-ca-cert-hash sha256:baaa860fb0cf4007b31979e0e21fdc45ec12ad2857aba3a82b63ec26044da597
# Add the NVIDIA container toolkit apt repository: store the dearmored signing
# key, then write the repository list with a signed-by option pointing at it.
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
&& curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
# Install the NVIDIA driver, CUDA toolkit and container toolkit.
nala update && nala install nvidia-driver-535 nvidia-cuda-toolkit nvidia-container-toolkit -y
# Let nvidia-ctk update the containerd configuration, then restart containerd
# so the change takes effect.
nvidia-ctk runtime configure --runtime=containerd
service containerd restart