Initial commit
This commit is contained in:
commit
bffccfcde7
|
@ -0,0 +1,9 @@
|
|||
#!/bin/bash
# Back up the local etcd instance, keep the 7 most recent snapshots, and
# report the outcome as an `etcd_backup` metric to VictoriaMetrics.
set -euo pipefail

# Push a failure status (0.0) to vmagent if any command below fails.
trap "echo 'sending fail backup status to vmagent...' && curl -k -d 'etcd_backup,hostname=ml-cbt-01 status=0.0' -X POST https://<vmagent-url>/write" ERR

cd /share/kubernetes/backups/etcd/ # backup destination directory

timestamp=$(date +"%Y-%m-%d-%H-%M-%S")

# Snapshot etcd (v3 API) over the local endpoint using the server certificates.
ETCDCTL_API=3 /usr/bin/etcdctl --cacert /etc/kubernetes/pki/etcd/ca.crt --cert /etc/kubernetes/pki/etcd/server.crt --key /etc/kubernetes/pki/etcd/server.key --endpoints=https://127.0.0.1:2379 snapshot save "$timestamp.db"

# Verify the snapshot is readable and consistent.
ETCDCTL_API=3 etcdctl --write-out=table snapshot status "$timestamp.db"

# Keep only the 7 newest backups. Guard the empty case: `rm` with no
# operands exits non-zero, which would trip `set -e` and the ERR trap
# even though the backup itself succeeded.
old_backups=$(ls -t | awk 'NR>7')
if [[ -n "$old_backups" ]]; then
  printf '%s\n' "$old_backups" | xargs -d '\n' rm -f --
fi

# Report success; produces metric etcd_backup_status with value 1.
echo 'sending success backup status to vmagent...' && curl -k -d 'etcd_backup,hostname=ml-cbt-01 status=1' -X POST https://<vmagent-url>/write # success marker for alerting
|
|
@ -0,0 +1,19 @@
|
|||
# Merge a git submodule's full history into the parent repository.
#
# $1 - path/to/module/  (directory where the submodule's files end up)
# $2 - number-of-submodule  (used to name a unique temporary branch)
# $3 - submodule merging branch  (branch of the submodule to merge)
# $4 - repo name  (repository name on the remote)

# Drop any leftover remote from a previous run (harmless if absent),
# then remove the submodule entry from the parent repo.
git remote rm submodule_origin
git rm "$1"
git commit -m "Remove $4 submodule"

# Re-add the submodule's repository as a plain remote and fetch its
# history, including LFS objects. NOTE: the URL must be quoted —
# unquoted, the shell treats `<repo-url>` as redirections.
git remote add submodule_origin "ssh://<repo-url>/$4.git"
git fetch submodule_origin
git lfs fetch submodule_origin --all

# Check out the submodule branch under a unique local branch name.
git branch "merge-branch-$2" "submodule_origin/$3"
git checkout "merge-branch-$2"
git lfs fetch submodule_origin --all

# Move everything from the repo root into the target subdirectory
# (NUL-delimited so paths with spaces survive).
mkdir -p "$1"
git ls-tree -z --name-only HEAD | xargs -0 -I {} git mv {} "$1"
git commit -m "Moved files to $1"

# Merge the rewritten history into the integration branch and push.
git checkout feature/merge-submodules
git merge --allow-unrelated-histories "merge-branch-$2"
git push --set-upstream origin feature/merge-submodules
|
|
@ -0,0 +1,69 @@
|
|||
#!/usr/bin/env python3
"""Create and enable a systemd oneshot service that runs a given script.

Usage: <script> <name-of-service> <absolute-path-to-script> [description]

Writes /lib/systemd/system/<name>.service, makes the target script
executable, then enables and starts the unit. Requires root privileges.
"""
import sys, os, subprocess

help_text = f"""
Usage: {os.path.basename(__file__)} $1 $2 $3

required:
$1 - name of service
$2 - absolute path to script

optional:
$3 - description of service
"""
number_of_required_arguments = 2
number_of_optional_arguments = 1

# Argument validation: show help when asked, fail on a wrong argument count.
if len(sys.argv) == 1:
    print(help_text)
    sys.exit()
elif sys.argv[1] in ["help", "-h"]:
    print(help_text)
    sys.exit()
elif len(sys.argv) - 1 < number_of_required_arguments:
    print("You provided not enough arguments")
    print(help_text)
    sys.exit(1)
elif len(sys.argv) > number_of_required_arguments + number_of_optional_arguments + 1:
    print("You provided extra arguments")
    print(help_text)
    sys.exit(1)

name_of_service = sys.argv[1]
path_to_script = sys.argv[2]
description = sys.argv[3] if len(sys.argv) > 3 else ""  # empty if no description

if not os.path.isabs(path_to_script):
    print("Path to script should be absolute!")
    print(help_text)
    sys.exit(1)
elif not os.path.isfile(path_to_script):
    print("Path to script should exist and must be file!")
    print(help_text)
    sys.exit(1)


# Oneshot unit: runs the script once per activation, logs to journald.
service_file = f"""
[Unit]
Description={description}
After=network-online.target

[Service]
Type=oneshot
ExecStart={path_to_script}
StandardOutput=journal
StandardError=journal

[Install]
WantedBy=multi-user.target
"""

try:
    with open(f"/lib/systemd/system/{name_of_service}.service", "w") as f:
        f.write(service_file)
    # chmod +x without spawning a shell — the previous shell=True call was
    # injectable through the path argument.
    os.chmod(path_to_script, os.stat(path_to_script).st_mode | 0o111)
    # Enable and start the unit; only report success when systemctl succeeds
    # (mirrors the old `systemctl ... && echo Success!!!` behavior).
    result = subprocess.run(["systemctl", "enable", "--now", name_of_service])
    if result.returncode == 0:
        print("Success!!!")
except Exception as e:
    print(f"Something went wrong... ({e})")
    sys.exit(1)
|
||||
|
|
@ -0,0 +1,51 @@
|
|||
#!/usr/bin/env python3
"""Create and enable a systemd timer for an existing service.

Usage: <script> <name-of-service> <OnCalendar-expression>

Writes /lib/systemd/system/<name>.timer, then enables and starts it.
Requires root privileges and an existing <name>.service unit.
"""
import sys, os, subprocess

help_text = f"""
Usage: {os.path.basename(__file__)} $1 $2

required:
$1 - name of service
$2 - calendar (systemd OnCalendar expression, e.g. "daily" or "*-*-* 03:00:00")
"""

number_of_required_arguments = 2
number_of_optional_arguments = 0

# Argument validation: show help when asked, fail on a wrong argument count.
if len(sys.argv) == 1:
    print(help_text)
    sys.exit()
elif sys.argv[1] in ["help", "-h"]:
    print(help_text)
    sys.exit()  # BUG FIX: was missing — execution fell through and crashed on sys.argv[2]
elif len(sys.argv) - 1 < number_of_required_arguments:
    print("You provided not enough arguments")
    print(help_text)
    sys.exit(1)
elif len(sys.argv) > number_of_required_arguments + number_of_optional_arguments + 1:
    print("You provided extra arguments")
    print(help_text)
    sys.exit(1)

name_of_service = sys.argv[1]
calendar = sys.argv[2]

# Timer unit that triggers <name>.service on the given calendar.
timer_file = f"""
[Unit]
Description={name_of_service} timer

[Timer]
Unit={name_of_service}.service
OnCalendar={calendar}

[Install]
WantedBy=timers.target
"""

try:
    with open(f"/lib/systemd/system/{name_of_service}.timer", "w") as f:
        f.write(timer_file)
    # Enable and start the timer; only report success when systemctl succeeds
    # (replaces os.system string interpolation with an argv list).
    result = subprocess.run(["systemctl", "enable", "--now", f"{name_of_service}.timer"])
    if result.returncode == 0:
        print("Success!!!")
except Exception as e:
    print(f"Something went wrong... ({e})")
    sys.exit(1)
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
#!/bin/bash
# Toggle a DaemonSet off/on by (un)setting an unsatisfiable nodeSelector.
#
# $1 - "down" to evict all pods, "up" to restore normal scheduling
# $2 - namespace
# $3 - daemonset name
#
# "${1:-}" is quoted with a default so the script no longer produces a
# `[: =: unary operator expected`-style error when called with no args.
if [[ "${1:-}" == "down" ]]; then
  # Add a nodeSelector no node satisfies, so every pod is removed.
  KUBECONFIG=/etc/kubernetes/admin.conf kubectl -n "$2" patch daemonset "$3" -p '{"spec": {"template": {"spec": {"nodeSelector": {"non-existing": "true"}}}}}'
elif [[ "${1:-}" == "up" ]]; then
  # Remove the bogus selector; pods are scheduled again.
  KUBECONFIG=/etc/kubernetes/admin.conf kubectl -n "$2" patch daemonset "$3" --type json -p='[{"op": "remove", "path": "/spec/template/spec/nodeSelector/non-existing"}]'
fi
|
||||
|
|
@ -0,0 +1,144 @@
|
|||
# --- Control-plane node bootstrap: Docker + Kubernetes packages ---

# Add Docker's official GPG key:
sudo apt-get update
sudo apt-get install -y ca-certificates curl nfs-common # nfs-common needed for nfs-client storage class; -y so the script doesn't stall on a prompt
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc

# Add the repository to Apt sources:
echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update

# install docker/containerd
sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin

# install kubelet kubeadm kubectl (v1.29 line from pkgs.k8s.io)
curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.29/deb/Release.key | sudo gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg
echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.29/deb/ /' | sudo tee /etc/apt/sources.list.d/kubernetes.list
sudo apt update
sudo apt install -y kubelet kubeadm kubectl
sudo apt-mark hold kubelet kubeadm kubectl # pin versions (sudo added — every other privileged step uses it)
||||
|
||||
# install nvidia-container-toolkit and make it default runtime

curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
&& curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list

sudo apt update && sudo apt install nvidia-container-toolkit -y
# Minimal containerd CRI config with nvidia as the default runtime.
# Fixed: write via `sudo tee` (a bare `>` redirect is NOT elevated by sudo),
# and the option is `SystemdCgroup` — the old `systemdCgroup` spelling is
# silently ignored by containerd, leaving cgroupfs in use under systemd.
cat << EOF | sudo tee /etc/containerd/config.toml
version = 2

[plugins]

[plugins."io.containerd.grpc.v1.cri"]

[plugins."io.containerd.grpc.v1.cri".containerd]
default_runtime_name = "nvidia"

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
privileged_without_host_devices = false
runtime_engine = ""
runtime_root = ""
runtime_type = "io.containerd.runc.v2"

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
BinaryName = "/usr/bin/nvidia-container-runtime"
SystemdCgroup = true
EOF
sudo service containerd restart

# Point Docker at the nvidia runtime as well, then restart it.
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
|
||||
# install nginx for load balancing
# nginx-light plus the stream module act as a local TCP balancer in front
# of the kube-apiservers; clients (kubelet, kubectl) talk to 127.0.0.1:8080.
sudo apt install nginx-light libnginx-mod-stream -y

# Write the balancer config. NOTE(review): the plain `>` redirect is not
# elevated by sudo — this heredoc only works when the script runs as root.
cat << EOF > /etc/nginx/nginx.conf
error_log stderr notice;
load_module /lib/nginx/modules/ngx_stream_module.so;

worker_processes auto;
worker_rlimit_nofile 130048;
worker_shutdown_timeout 10s;

events {
multi_accept on;
use epoll;
worker_connections 16384;
}

stream {
upstream kube_apiserver {
least_conn;
server 127.0.0.1:6443;
server 10.239.10.222:6443;
server 10.239.10.223:6443;
}

server {
listen 127.0.0.1:8080;
proxy_pass kube_apiserver;
proxy_timeout 10m;
proxy_connect_timeout 1s;
}
}

http {
aio threads;
aio_write on;
tcp_nopush on;
tcp_nodelay on;

keepalive_timeout 5m;
keepalive_requests 100;
reset_timedout_connection on;
server_tokens off;
autoindex off;

server {
listen 8081;
location /healthz {
access_log off;
return 200;
}
location /stub_status {
stub_status on;
access_log off;
}
}
}
EOF

sudo service nginx restart
|
||||
# Initialise the first control-plane node; the control-plane endpoint is
# the local nginx balancer so the apiserver VIP survives node failures.
kubeadm init --upload-certs --control-plane-endpoint=127.0.0.1:8080

# NOTE(review): the token and certificate key below are cluster secrets
# and are short-lived; regenerate a join command with
# `kubeadm token create --print-join-command` instead of reusing these.
kubeadm join 127.0.0.1:8080 --token e01u52.c9uq77rkvl3qm86u --discovery-token-ca-cert-hash sha256:a7fc076bdcd7391e8bc7577b54ecc492d319298b5699293f8390042b57866700 --control-plane --certificate-key 2931030dff3041c185715298ab833895c6f36028a97b2f139857641bbe7f66b5 # master

kubeadm join 127.0.0.1:8080 --token e01u52.c9uq77rkvl3qm86u --discovery-token-ca-cert-hash sha256:a7fc076bdcd7391e8bc7577b54ecc492d319298b5699293f8390042b57866700 # worker
|
||||
|
||||
# install cilium (CNI plugin) via the cilium CLI:
# download the latest CLI release for this architecture, verify its
# checksum, unpack it into /usr/local/bin, then install cilium 1.15.1
# into the cluster.
CILIUM_CLI_VERSION=$(curl -s https://raw.githubusercontent.com/cilium/cilium-cli/main/stable.txt)
CLI_ARCH=amd64
if [ "$(uname -m)" = "aarch64" ]; then CLI_ARCH=arm64; fi
curl -L --fail --remote-name-all https://github.com/cilium/cilium-cli/releases/download/${CILIUM_CLI_VERSION}/cilium-linux-${CLI_ARCH}.tar.gz{,.sha256sum}
sha256sum --check cilium-linux-${CLI_ARCH}.tar.gz.sha256sum
sudo tar xzvfC cilium-linux-${CLI_ARCH}.tar.gz /usr/local/bin
rm cilium-linux-${CLI_ARCH}.tar.gz{,.sha256sum}

cilium install --version 1.15.1
|
||||
|
||||
# Remove the control-plane NoSchedule taint from the masters so regular
# workloads can run there (the trailing '-' deletes the taint).
# BUG FIX: a comment was fused onto the command ("NoSchedule-# install
# tools"), which made the taint argument `...NoSchedule-#` and passed
# stray words `install tools` to kubectl.
kubectl taint nodes ml-cbt-02 ml-cbt-03 node-role.kubernetes.io/control-plane:NoSchedule-

# install tools
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash # helm
helm plugin install https://github.com/databus23/helm-diff # helm-diff needed for helmfile
|
|
@ -0,0 +1,44 @@
|
|||
# --- LDAP/SSSD login setup (worker node) ---
# The commented-out vim lines mark config files that must be edited by
# hand (CA cert, ldap.conf, hosts, sssd.conf) before restarting sssd.
nala install -y sssd-ldap sssd-tools ldap-utils
mkdir /etc/ldap/ca/
#vim /etc/ldap/ca/ninv.crt
#vim /etc/ldap/ldap.conf
#vim /etc/hosts
#vim /etc/sssd/sssd.conf
chmod 600 /etc/sssd/sssd.conf
pam-auth-update --enable mkhomedir
service sssd restart
|
||||
# --- Kubernetes packages for the worker (v1.29 line from pkgs.k8s.io) ---
nala install -y apt-transport-https ca-certificates curl gpg
curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.29/deb/Release.key | sudo gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg
echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.29/deb/ /' | sudo tee /etc/apt/sources.list.d/kubernetes.list
nala update
nala install -y kubelet kubeadm kubectl
# Pin kubelet/kubeadm/kubectl so routine upgrades don't skew cluster versions.
apt-mark hold kubelet kubeadm kubectl
nala install -y containerd
|
||||
# --- Kernel prerequisites and cluster join ---
swapoff -a
# Remove the swap entry from fstab by hand so swap stays off after reboot.
#vim /etc/fstab
# Load the kernel modules kubernetes networking needs, now and at boot.
cat <<EOF | sudo tee /etc/modules-load.d/k8s.conf
overlay
br_netfilter
EOF

modprobe overlay
modprobe br_netfilter

# sysctl params required by setup, params persist across reboots
cat <<EOF | sudo tee /etc/sysctl.d/k8s.conf
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.ipv4.ip_forward = 1
EOF

# Apply sysctl params without reboot
sudo sysctl --system
# NOTE(review): the join token is a short-lived cluster secret; regenerate
# with `kubeadm token create --print-join-command` on a control-plane node.
kubeadm join 10.239.10.221:6443 --token z51k9o.144c6ntyob9ut43y --discovery-token-ca-cert-hash sha256:baaa860fb0cf4007b31979e0e21fdc45ec12ad2857aba3a82b63ec26044da597
|
||||
# --- NVIDIA driver + container toolkit for GPU workloads ---
# Add NVIDIA's apt repo (key rewritten into signed-by form), install the
# driver/toolkit, then register nvidia as a containerd runtime.
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
&& curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
nala update && nala install nvidia-driver-535 nvidia-cuda-toolkit nvidia-container-toolkit -y
nvidia-ctk runtime configure --runtime=containerd
service containerd restart
||||
|
Loading…
Reference in New Issue