Initial commit
This commit is contained in:
commit
bffccfcde7
|
@ -0,0 +1,9 @@
|
||||||
|
#!/bin/bash
# Takes an etcd snapshot, verifies it, prunes old snapshots and reports the
# outcome to vmagent as the etcd_backup metric (status=1 success, 0.0 failure).

set -e

# Placeholder from the original script: replace with the real vmagent address.
# Quoted on purpose: unquoted, the angle brackets are parsed as redirections.
readonly vmagent_url="https://<vmagent-url>/write"
readonly etcdctl=/usr/bin/etcdctl
readonly keep_count=7

# Pushes the backup status to vmagent.
# Arguments: $1 - value of the status field (1 or 0.0)
report_status() {
  curl -k -d "etcd_backup,hostname=ml-cbt-01 status=$1" -X POST "$vmagent_url"
}

# Send an alert to VictoriaMetrics if any command below fails.
trap 'echo "sending fail backup status to vmagent..." && report_status 0.0' ERR

cd /share/kubernetes/backups/etcd/ # switch to the backup directory

timestamp=$(date +"%Y-%m-%d-%H-%M-%S")

# Back up etcd.
ETCDCTL_API=3 "$etcdctl" \
  --cacert /etc/kubernetes/pki/etcd/ca.crt \
  --cert /etc/kubernetes/pki/etcd/server.crt \
  --key /etc/kubernetes/pki/etcd/server.key \
  --endpoints=https://127.0.0.1:2379 \
  snapshot save "$timestamp.db"

# Check that the snapshot is readable.
ETCDCTL_API=3 "$etcdctl" --write-out=table snapshot status "$timestamp.db"

# Keep only the newest $keep_count entries of this directory and delete the
# rest. rm is skipped when there is nothing to delete: calling it without
# operands fails, which used to fire the ERR trap on the first runs.
mapfile -t stale_backups < <(ls -t | awk -v keep="$keep_count" 'NR>keep')
if (( ${#stale_backups[@]} > 0 )); then
  rm -- "${stale_backups[@]}"
fi

# Report the successful backup; the resulting metric is etcd_backup_status
# with the value 1.
echo 'sending success backup status to vmagent...' && report_status 1
|
|
@ -0,0 +1,19 @@
|
||||||
|
#!/usr/bin/env bash
# Replaces a git submodule with the submodule repository's own history,
# moved under the same path and merged into feature/merge-submodules.
#
# $1 - path/to/module/
# $2 - number-of-submodule
# $3 - submodule merging branch
# $4 - repo name

# Stop at the first failing step so a broken fetch or merge is never pushed.
set -euo pipefail

if (( $# != 4 )); then
  printf 'Usage: %s path/to/module/ number-of-submodule branch repo-name\n' "${0##*/}" >&2
  exit 2
fi

module_path=$1
merge_branch="merge-branch-$2"
source_branch=$3
repo_name=$4

# Drop a remote left over from a previous run; a missing remote is not an
# error, hence the deliberate "|| true".
git remote rm submodule_origin 2>/dev/null || true

git rm "$module_path"
git commit -m "Remove $repo_name submodule"

# "<repo-url>" is a placeholder kept from the original script; it is quoted so
# the angle brackets are not parsed as redirections.
git remote add submodule_origin "ssh://<repo-url>/$repo_name.git"
git fetch submodule_origin
git lfs fetch submodule_origin --all

git branch "$merge_branch" "submodule_origin/$source_branch"
git checkout "$merge_branch"
git lfs fetch submodule_origin --all

# Move every top-level entry of the submodule history under the module path.
mkdir -p "$module_path"
git ls-tree -z --name-only HEAD | xargs -0 -I {} git mv {} "$module_path"
git commit -m "Moved files to $module_path"

git checkout feature/merge-submodules
git merge --allow-unrelated-histories "$merge_branch"
git push --set-upstream origin feature/merge-submodules
|
|
@ -0,0 +1,69 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
import sys, os, subprocess
|
||||||
|
|
||||||
|
# Creates /lib/systemd/system/<name>.service that runs the given script as a
# oneshot unit, makes the script executable and enables + starts the unit.
help_text = f"""
Usage: {os.path.basename(__file__)} $1 $2 $3

required:
$1 - name of service
$2 - absolute path to script

optional:
$3 - description of service
"""
number_of_required_arguments = 2
number_of_optional_arguments = 1

# Argument validation: help goes to exit code 0, misuse to exit code 1.
if len(sys.argv) == 1:
    print(help_text)
    sys.exit()
elif sys.argv[1] in ["help", "-h"]:
    print(help_text)
    sys.exit()
elif len(sys.argv) - 1 < number_of_required_arguments:
    print("You provided not enough arguments")
    print(help_text)
    sys.exit(1)
elif len(sys.argv) > number_of_required_arguments + number_of_optional_arguments + 1:
    print("You provided extra arguments")
    print(help_text)
    sys.exit(1)

name_of_service = sys.argv[1]
path_to_script = sys.argv[2]
description = sys.argv[3] if len(sys.argv) > 3 else ""  # empty if no description

if not os.path.isabs(path_to_script):
    print("Path to script should be absolute!")
    print(help_text)
    sys.exit(1)
elif not os.path.isfile(path_to_script):
    print("Path to script should exist and must be file!")
    print(help_text)
    sys.exit(1)


service_file = f"""
[Unit]
Description={description}
After=network-online.target

[Service]
Type=oneshot
ExecStart={path_to_script}
StandardOutput=journal
StandardError=journal

[Install]
WantedBy=multi-user.target
"""

try:
    with open(f"/lib/systemd/system/{name_of_service}.service", "w") as f:
        f.write(service_file)
    # Argument lists instead of shell strings: paths and names with spaces or
    # shell metacharacters are passed through untouched.
    subprocess.run(["chmod", "+x", path_to_script], check=True)
    # check=True turns a failing systemctl into an error instead of a silent
    # exit code 0.
    subprocess.run(["systemctl", "enable", "--now", name_of_service], check=True)
    print("Success!!!")
except (OSError, subprocess.CalledProcessError) as error:
    # Only the expected failures are caught; a bare except would also swallow
    # SystemExit and KeyboardInterrupt.
    print("Something went wrong...")
    print(error, file=sys.stderr)
    sys.exit(1)
|
||||||
|
|
|
@ -0,0 +1,51 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
import sys, os, subprocess
|
||||||
|
|
||||||
|
# Creates /lib/systemd/system/<name>.timer that triggers <name>.service on the
# given calendar schedule, then enables + starts the timer.
help_text = f"""
Usage: {os.path.basename(__file__)} $1 $2

required:
$1 - name of service
$2 - calendar (systemd OnCalendar expression, e.g. "daily" or "*-*-* 03:00:00")
"""

number_of_required_arguments = 2
number_of_optional_arguments = 0

# Argument validation: help goes to exit code 0, misuse to exit code 1.
if len(sys.argv) == 1:
    print(help_text)
    sys.exit()
elif sys.argv[1] in ["help", "-h"]:
    print(help_text)
    # Without this exit the script carried on and crashed on sys.argv[2].
    sys.exit()
elif len(sys.argv) - 1 < number_of_required_arguments:
    print("You provided not enough arguments")
    print(help_text)
    sys.exit(1)
elif len(sys.argv) > number_of_required_arguments + number_of_optional_arguments + 1:
    print("You provided extra arguments")
    print(help_text)
    sys.exit(1)

name_of_service = sys.argv[1]
calendar = sys.argv[2]

timer_file = f"""
[Unit]
Description={name_of_service} timer

[Timer]
Unit={name_of_service}.service
OnCalendar={calendar}

[Install]
WantedBy=timers.target
"""

try:
    with open(f"/lib/systemd/system/{name_of_service}.timer", "w") as f:
        f.write(timer_file)
    # Argument list + check=True: no shell parsing of the service name, and a
    # failing systemctl is reported instead of being ignored.
    subprocess.run(
        ["systemctl", "enable", "--now", f"{name_of_service}.timer"], check=True
    )
    print("Success!!!")
except (OSError, subprocess.CalledProcessError) as error:
    # Only the expected failures are caught; a bare except would also swallow
    # SystemExit and KeyboardInterrupt.
    print("Something went wrong...")
    print(error, file=sys.stderr)
    sys.exit(1)
|
||||||
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
#!/bin/bash
# Stops or restores a DaemonSet by patching its pod template nodeSelector.
#
# Usage: <script> down|up <namespace> <daemonset>
#   down - adds the nodeSelector "non-existing": "true"
#   up   - removes that nodeSelector entry again

usage() {
  printf 'Usage: %s down|up <namespace> <daemonset>\n' "${0##*/}" >&2
}

if (( $# != 3 )); then
  usage
  exit 2
fi

action=$1
namespace=$2
daemonset=$3

case "$action" in
  down)
    KUBECONFIG=/etc/kubernetes/admin.conf kubectl -n "$namespace" patch daemonset "$daemonset" \
      -p '{"spec": {"template": {"spec": {"nodeSelector": {"non-existing": "true"}}}}}'
    ;;
  up)
    KUBECONFIG=/etc/kubernetes/admin.conf kubectl -n "$namespace" patch daemonset "$daemonset" \
      --type json -p='[{"op": "remove", "path": "/spec/template/spec/nodeSelector/non-existing"}]'
    ;;
  *)
    # An unknown action used to be ignored silently with exit code 0.
    usage
    exit 2
    ;;
esac
|
||||||
|
|
|
@ -0,0 +1,144 @@
|
||||||
|
# Add Docker's official GPG key:
sudo apt-get update
# -y keeps the install from stopping at the confirmation prompt.
sudo apt-get install -y ca-certificates curl nfs-common # nfs-common is needed for the nfs-client storage class
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc

# Add the repository to Apt sources:
echo \
  "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
  $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
  sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update

# Install docker/containerd.
sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin

# Install kubelet, kubeadm and kubectl from the v1.29 package repository.
curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.29/deb/Release.key | sudo gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg
echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.29/deb/ /' | sudo tee /etc/apt/sources.list.d/kubernetes.list
sudo apt update
sudo apt install -y kubelet kubeadm kubectl
# Pin the versions; run through sudo like every other package operation here.
sudo apt-mark hold kubelet kubeadm kubectl
|
||||||
|
|
||||||
|
# Install nvidia-container-toolkit and make it the default containerd runtime.

curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
  && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
    sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
    sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list

sudo apt update && sudo apt install nvidia-container-toolkit -y

# The config is written through "sudo tee": a plain "cat > file" redirect is
# opened by the calling shell and fails without root, unlike the other steps.
# The quoted 'EOF' delimiter keeps the TOML literal (no shell expansion).
# NOTE(review): upstream containerd examples spell the cgroup option
# "SystemdCgroup"; confirm the lowercase "systemdCgroup" below is honoured.
sudo tee /etc/containerd/config.toml > /dev/null << 'EOF'
version = 2

[plugins]

  [plugins."io.containerd.grpc.v1.cri"]

    [plugins."io.containerd.grpc.v1.cri".containerd]
      default_runtime_name = "nvidia"

      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]

        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
          privileged_without_host_devices = false
          runtime_engine = ""
          runtime_root = ""
          runtime_type = "io.containerd.runc.v2"

          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
            BinaryName = "/usr/bin/nvidia-container-runtime"
            systemdCgroup = true
EOF
sudo service containerd restart

# Register the NVIDIA runtime with docker as well.
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
|
||||||
|
# Install nginx as a local TCP load balancer in front of the kube-apiservers.
sudo apt install nginx-light libnginx-mod-stream -y

# The config is written through "sudo tee": a plain "cat > file" redirect is
# opened by the calling shell and fails without root, unlike the other steps.
# The quoted 'EOF' delimiter keeps the nginx config literal.
sudo tee /etc/nginx/nginx.conf > /dev/null << 'EOF'
error_log stderr notice;
load_module /lib/nginx/modules/ngx_stream_module.so;

worker_processes auto;
worker_rlimit_nofile 130048;
worker_shutdown_timeout 10s;

events {
  multi_accept on;
  use epoll;
  worker_connections 16384;
}

stream {
  upstream kube_apiserver {
    least_conn;
    server 127.0.0.1:6443;
    server 10.239.10.222:6443;
    server 10.239.10.223:6443;
  }

  server {
    listen 127.0.0.1:8080;
    proxy_pass kube_apiserver;
    proxy_timeout 10m;
    proxy_connect_timeout 1s;
  }
}

http {
  aio threads;
  aio_write on;
  tcp_nopush on;
  tcp_nodelay on;

  keepalive_timeout 5m;
  keepalive_requests 100;
  reset_timedout_connection on;
  server_tokens off;
  autoindex off;

  server {
    listen 8081;
    location /healthz {
      access_log off;
      return 200;
    }
    location /stub_status {
      stub_status on;
      access_log off;
    }
  }
}
EOF

sudo service nginx restart
|
||||||
|
# Bootstrap the cluster through the local nginx endpoint (127.0.0.1:8080).
# NOTE(review): the three commands below look like alternatives meant for
# different nodes (first control plane / further masters / workers) rather
# than one sequence for a single host - confirm before running as a script.
# NOTE(review): the join token and certificate key are committed in plain
# text; consider rotating them and passing fresh values at run time.
kubeadm init \
  --upload-certs \
  --control-plane-endpoint=127.0.0.1:8080

# master
kubeadm join 127.0.0.1:8080 \
  --token e01u52.c9uq77rkvl3qm86u \
  --discovery-token-ca-cert-hash sha256:a7fc076bdcd7391e8bc7577b54ecc492d319298b5699293f8390042b57866700 \
  --control-plane \
  --certificate-key 2931030dff3041c185715298ab833895c6f36028a97b2f139857641bbe7f66b5

# worker
kubeadm join 127.0.0.1:8080 \
  --token e01u52.c9uq77rkvl3qm86u \
  --discovery-token-ca-cert-hash sha256:a7fc076bdcd7391e8bc7577b54ecc492d319298b5699293f8390042b57866700
|
||||||
|
|
||||||
|
# Install the cilium CLI (latest stable release, checksum-verified), then
# deploy cilium 1.15.1 with it.
CILIUM_CLI_VERSION=$(curl -s https://raw.githubusercontent.com/cilium/cilium-cli/main/stable.txt)

# amd64 unless the machine reports aarch64.
case "$(uname -m)" in
  aarch64) CLI_ARCH=arm64 ;;
  *) CLI_ARCH=amd64 ;;
esac

# Download the tarball together with its .sha256sum file.
curl -L --fail --remote-name-all \
  https://github.com/cilium/cilium-cli/releases/download/${CILIUM_CLI_VERSION}/cilium-linux-${CLI_ARCH}.tar.gz \
  https://github.com/cilium/cilium-cli/releases/download/${CILIUM_CLI_VERSION}/cilium-linux-${CLI_ARCH}.tar.gz.sha256sum
sha256sum --check cilium-linux-${CLI_ARCH}.tar.gz.sha256sum
sudo tar -xzvf cilium-linux-${CLI_ARCH}.tar.gz -C /usr/local/bin
rm cilium-linux-${CLI_ARCH}.tar.gz cilium-linux-${CLI_ARCH}.tar.gz.sha256sum

cilium install --version 1.15.1
|
||||||
|
|
||||||
|
# Remove the control-plane NoSchedule taint from the masters (the trailing
# "-" means "remove"). A comment used to be fused onto this line, which made
# "NoSchedule-#", "install" and "tools" extra arguments to kubectl.
kubectl taint nodes ml-cbt-02 ml-cbt-03 node-role.kubernetes.io/control-plane:NoSchedule-

# Install tools.
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash # helm
helm plugin install https://github.com/databus23/helm-diff # helm-diff is needed for helmfile
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,44 @@
|
||||||
|
# LDAP logins through SSSD. The "#vim" lines are manual steps: those files
# have to be filled in by hand before the commands that follow them.
nala install -y sssd-ldap sssd-tools ldap-utils
mkdir -p /etc/ldap/ca/ # -p: do not fail when the directory already exists
#vim /etc/ldap/ca/ninv.crt
#vim /etc/ldap/ldap.conf
#vim /etc/hosts
#vim /etc/sssd/sssd.conf
chmod 600 /etc/sssd/sssd.conf
pam-auth-update --enable mkhomedir
service sssd restart

# kubelet, kubeadm and kubectl from the v1.29 package repository.
nala install -y apt-transport-https ca-certificates curl gpg
# Make sure the keyring directory exists before gpg writes into it.
sudo install -m 0755 -d /etc/apt/keyrings
curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.29/deb/Release.key | sudo gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg
echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.29/deb/ /' | sudo tee /etc/apt/sources.list.d/kubernetes.list
nala update
nala install -y kubelet kubeadm kubectl
apt-mark hold kubelet kubeadm kubectl
nala install -y containerd

# Turn swap off now; the fstab edit is the manual step for after a reboot.
swapoff -a
#vim /etc/fstab
|
||||||
|
# Kernel modules to load at boot, and loaded right away as well.
printf '%s\n' overlay br_netfilter | sudo tee /etc/modules-load.d/k8s.conf

for kernel_module in overlay br_netfilter; do
  modprobe "$kernel_module"
done

# sysctl params required by setup, params persist across reboots
sudo tee /etc/sysctl.d/k8s.conf <<EOF
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.ipv4.ip_forward = 1
EOF

# Apply sysctl params without reboot
sudo sysctl --system

# Join the cluster as a worker.
# NOTE(review): the join token is committed in plain text - confirm it is
# still valid or has been rotated.
kubeadm join 10.239.10.221:6443 \
  --token z51k9o.144c6ntyob9ut43y \
  --discovery-token-ca-cert-hash sha256:baaa860fb0cf4007b31979e0e21fdc45ec12ad2857aba3a82b63ec26044da597

# NVIDIA package repository: import the key, then register the signed source.
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
  | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
  && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
    | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
    | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list

# Driver, CUDA toolkit and container toolkit; then hook the NVIDIA runtime
# into containerd.
nala update && nala install nvidia-driver-535 nvidia-cuda-toolkit nvidia-container-toolkit -y
nvidia-ctk runtime configure --runtime=containerd
service containerd restart
|
||||||
|
|
Loading…
Reference in New Issue