Commit d768cea2 by tingweiwang

更新k8s yaml

1 parent f20cebf0
Showing with 589 additions and 128 deletions
......@@ -60,8 +60,8 @@ spec:
cpu: "200m"
memory: "512Mi"
limits:
cpu: "1000m"
memory: "1024Mi"
cpu: "2000m"
memory: "3000Mi"
volumes:
- name: adl-volume
persistentVolumeClaim:
......
wapiVersion: v1
apiVersion: v1
kind: Service
metadata:
name: kpl--frontend
......
......@@ -48,8 +48,8 @@ spec:
cpu: "200m"
memory: "512Mi"
limits:
cpu: "1000m"
memory: "1024Mi"
cpu: "4000m"
memory: "4096Mi"
volumes:
- name: kpl-volume
persistentVolumeClaim:
......
......@@ -12,3 +12,7 @@ spec:
name: port-80
targetPort: 80
nodePort: 30180
- port: 81
name: port-81
targetPort: 81
nodePort: 30181
......@@ -53,9 +53,6 @@ spec:
- key: apiserver.conf
path: default.conf
- key: nginx.conf
path: nginx.conf
path: nginx.conf
- key: apiserver-edu.conf
path: default-edu.conf
......@@ -29,16 +29,21 @@ spec:
ports:
- containerPort: 9999
name: port-9999
- containerPort: 8080
name: wport-8080
livenessProbe:
tcpSocket:
port: port-9999
initialDelaySeconds: 10
httpGet:
path: /health
port: wport-8080
scheme: HTTP
resources:
requests:
cpu: "300m"
memory: "300Mi"
limits:
cpu: "1000m"
memory: "1024Mi"
cpu: "2000m"
memory: "2048Mi"
volumes:
- name: config
configMap:
......
......@@ -36,6 +36,8 @@ spec:
ports:
- containerPort: 8080
name: wport-8080
- containerPort: 8555
name: wport-8555
livenessProbe:
initialDelaySeconds: 10
httpGet:
......
# kpl-launcher部署文档
作者:刘弘也
## 依赖安装
本服务包含如下依赖
* Kubernetes 1.15+,支持CRD
* volcano v0.4.0
其中volcano可以使用本项目提供的部署脚本`k8s/volcano/*.yaml`进行安装。与官方版本的主要差异是其镜像已被搬运到金山云镜像仓库,以加快部署速度:
* 安装volcano:`kubectl apply -f k8s/volcano`
* 检查`volcano-system`命名空间及其内部的Pod是否全部成功启动,如下所示:
```shell
$ kubectl get pods -n volcano-system
NAME READY STATUS RESTARTS AGE
volcano-admission-7b498d4d56-7djqg 1/1 Running 0 4d23h
volcano-admission-init-7sqzw 0/1 Completed 0 4d23h
volcano-controllers-68d55f9444-sq4vt 1/1 Running 0 4d23h
volcano-scheduler-7cc766767b-hvbvb 1/1 Running 0 4d23h
```
## kpl-launcher安装
目前kpl-launcher服务进行部署的过程包含如下内容
* (事先准备好的)命名空间:"kpl"
* `k8s/kpl-launcher/rbac.yaml`
* ServiceAccount:"kpl-launcher"
* ClusterRole:"kpl-launcher"
* ClusterRoleBinding:"kpl-launcher"
* `k8s/kpl-launcher/deployment.yaml`
* Deployment:"kpl-launcher"
* 服务启动命令的可选参数
```shell
$ kpl_launcher --help
Usage of ./build/bin/kpl_launcher:
-address string
service listening address (default "[::]")
-port int
service listening port (default 8000)
-private-key string
private key for ssl/tls secured service
-cert-chain string
certificate chain for ssl/tls secured service
-incluster
if use incluster config
-local-config string
(optional) absolute path to the kubeconfig file (default "~/.kube/config")
```
* Service (默认为ClusterIP类型):"kpl-launcher-service"
具体部署步骤如下:
* 联系相关开发负责人(刘弘也)确认当前要部署的镜像版本,如:`hub.kce.ksyun.com/aivc-kpl/kpl-launcher:launcher-9efebf5`
* 修改`k8s/kpl-launcher/deployment.yaml`中的部署镜像:
```shell
export IMAGE_NAME=hub.kce.ksyun.com/aivc-kpl/kpl-launcher:launcher-f2d3958
sed -i "s|image: .*|image: ${IMAGE_NAME}|" k8s/kpl-launcher/deployment.yaml
```
* 创建命名空间kpl
```shell
kubectl create ns kpl
```
* 准备好ssl/tls的证书文件
* 使用openssl生成自签名证书(注意指定CN):
```shell
mkdir certs
# openssl req -newkey rsa:2048 -nodes -keyout certs/server.key -x509 -days 3650 -out certs/server.crt -subj "/CN=KPL"
openssl ecparam -genkey -name secp384r1 -out certs/server.key
openssl req -new -x509 -sha256 -key certs/server.key -out certs/server.crt -days 3650 -subj "/CN=KPL"
```
* 生成一个叫kpl-ssl的ConfigMap,其包含刚才生成的两个ssl证书文件:
```shell
kubectl -n kpl create configmap kpl-ssl --from-file=./certs
```
* 注意保留`certs`里的证书文件,客户端程序也需要使用。
* 安装kpl-launcher
```shell
kubectl apply -f k8s/kpl-launcher/rbac.yaml
kubectl apply -f k8s/kpl-launcher/deployment.yaml
```
* 检查kpl-launcher服务是否已经启动
```shell
$ kubectl logs -n kpl kpl-launcher-5b9b6d74bc-swhmm
I0526 13:01:17.015731 6 main.go:82] Start in secured mode ...
I0526 13:01:17.020704 6 launcher.go:44] new launcher with the following backends:
I0526 13:01:17.020714 6 launcher.go:46] volcano: &{0xc00036b680 0xc000402940 map[]}
I0526 13:01:17.020727 6 launcher.go:46] simple_job: &{0xc00036b680 map[]}
I0526 13:01:17.020742 6 main.go:96] try to start launcher server at [::]:8000
```
#!/bin/bash
# Author: wangtingwei
#
# Redeploys volcano and kpl-launcher from the manifests under ./k8s_yaml.
#
# Usage: <script> redeploy_all | private_deploy
#
# The project directory differs between environments, so cd into the project
# directory before running this script: k8s_yaml and image_list.txt are
# resolved relative to the current directory and must not be made absolute.

# Full name of the latest launcher image, read from image_list.txt.
# Abort early when it is missing or empty, otherwise the placeholder in the
# deployment template would be replaced by an empty string.
if [ ! -r image_list.txt ]; then
    echo "image_list.txt not found in $(pwd); run this script from the project directory" >&2
    exit 1
fi
IMAGE_NAME=$(cat image_list.txt)
if [ -z "$IMAGE_NAME" ]; then
    echo "image_list.txt is empty; nothing to deploy" >&2
    exit 1
fi
# Keep only the trailing "name:tag" part of the image reference.
PRI_IMAGE_NAME=$(echo "$IMAGE_NAME" | awk -F '/' '{print $NF}')
# Timestamp used to name the rendered manifests in /tmp.
date=$(date +%F-%H-%M)
####################################################################
# Create the volcano-system namespace when it does not exist yet. The namespace
# is queried by name because grepping the full listing also matches any other
# namespace whose name merely contains "volcano-system".
if kubectl get ns volcano-system >/dev/null 2>&1; then
    echo "volcano-system namespace already exit,continue operation"
else
    echo "namespace not found, autocreate namespace volcano-system" && kubectl create namespace volcano-system
fi
####################################################################
# Public-cloud variant: copy the launcher template to /tmp and substitute the
# image placeholder with the full image name.
# '|' is the sed delimiter because image references contain '/' and, in digest
# form, '@'.
sed_image_name () {
    echo "拷贝yaml到临时目录,公有云环境下sed修改镜像"
    sleep 2
    cp -a k8s_yaml/kpl-launcher/deployment.yaml "/tmp/deployment-$date.yaml" || return 1
    sed -i "s|IMAGE_NAME|$IMAGE_NAME|g" "/tmp/deployment-$date.yaml"
}
####################################################################
# Private-deployment variant: point the volcano and launcher images at
# harbor_host/k8s/ instead of hub.kce.ksyun.com.
private_sed_image_name() {
    # Work on copies in /tmp so sed never modifies the template files; a bad
    # substitution therefore needs no recovery step.
    cp -a k8s_yaml/volcano/volcano-development.yaml "/tmp/volcano-development-$date.yaml" || return 1
    cp -a k8s_yaml/kpl-launcher/deployment.yaml "/tmp/deployment-$date.yaml" || return 1
    # Dots are escaped so they match literally rather than as "any character".
    sed -i "s|hub\.kce\.ksyun\.com/aivc/volcanosh/vc-scheduler:latest|harbor_host/k8s/vc-scheduler:latest|g" "/tmp/volcano-development-$date.yaml"
    sed -i "s|hub\.kce\.ksyun\.com/aivc/volcanosh/vc-webhook-manager:latest|harbor_host/k8s/vc-webhook-manager:latest|g" "/tmp/volcano-development-$date.yaml"
    sed -i "s|hub\.kce\.ksyun\.com/aivc/volcanosh/vc-controller-manager:latest|harbor_host/k8s/vc-controller-manager:latest|g" "/tmp/volcano-development-$date.yaml"
    sed -i "s|IMAGE_NAME|harbor_host/k8s/$PRI_IMAGE_NAME|g" "/tmp/deployment-$date.yaml"
}
######################################################################
# Remove everything a previous deployment created. --ignore-not-found keeps a
# first-time run (nothing to delete yet) from reporting errors.
delete_server () {
    kubectl delete --ignore-not-found -f k8s_yaml/kpl-ssl-configmap.yaml
    kubectl delete --ignore-not-found -f k8s_yaml/kpl-ssl-configmap-autodl.yaml
    kubectl delete --ignore-not-found -f k8s_yaml/volcano/ && sleep 2
    kubectl delete --ignore-not-found -f k8s_yaml/kpl-launcher/ && sleep 2
    kubectl delete --ignore-not-found secrets -n volcano-system volcano-admission-secret
}
# Public-cloud variant: volcano is applied straight from the templates, the
# launcher from the rendered copy in /tmp.
create_server () {
    kubectl apply -f k8s_yaml/kpl-ssl-configmap.yaml
    kubectl apply -f k8s_yaml/kpl-ssl-configmap-autodl.yaml
    kubectl apply -f k8s_yaml/kpl-launcher/rbac.yaml
    kubectl apply -f k8s_yaml/volcano/
    kubectl apply -f "/tmp/deployment-$date.yaml" && sleep 3
}
# Private-deployment variant: both volcano and the launcher are applied from
# the rendered copies in /tmp.
private_create_server () {
    kubectl apply -f k8s_yaml/kpl-ssl-configmap.yaml
    kubectl apply -f k8s_yaml/kpl-ssl-configmap-autodl.yaml
    kubectl apply -f k8s_yaml/kpl-launcher/rbac.yaml
    kubectl apply -f "/tmp/volcano-development-$date.yaml" && sleep 3
    kubectl apply -f "/tmp/deployment-$date.yaml" && sleep 3
}
# Full redeploy for the public-cloud environment.
redeploy_all () {
    delete_server
    sed_image_name || exit 1
    create_server
}
# Full redeploy for a private environment.
private_deploy () {
    delete_server
    private_sed_image_name || exit 1
    private_create_server
}
case "$1" in
    redeploy_all)
        redeploy_all
        ;;
    private_deploy)
        private_deploy
        ;;
    *)
        echo "please input (redeploy_all or private_deploy)"
        # A missing or unknown sub-command is a usage error.
        exit 1
        ;;
esac
hub.kce.ksyun.com/aivc-kpl/kpl-launcher:launcher-854c5ac
\ No newline at end of file
......@@ -18,7 +18,7 @@ spec:
serviceAccount: kpl-launcher
containers:
- name: launcher
image: hub.kce.ksyun.com/aivc-kpl/kpl-launcher:launcher-d8597db
image: IMAGE_NAME #镜像仓库以及名字变量模板
command:
- /bin/bash
- -c
......
# kpl-launcher部署文档
作者:刘弘也
## 依赖安装
本服务包含如下依赖
* Kubernetes 1.15+,支持CRD
* volcano v0.4.0
其中volcano可以使用本项目提供的部署脚本`k8s/volcano/*.yaml`进行安装。与官方版本的主要差异是其镜像已被搬运到金山云镜像仓库,以加快部署速度:
* 安装volcano:`kubectl apply -f k8s/volcano`
* 检查`volcano-system`命名空间及其内部的Pod是否全部成功启动,如下所示:
```shell
$ kubectl get pods -n volcano-system
NAME READY STATUS RESTARTS AGE
volcano-admission-7b498d4d56-7djqg 1/1 Running 0 4d23h
volcano-admission-init-7sqzw 0/1 Completed 0 4d23h
volcano-controllers-68d55f9444-sq4vt 1/1 Running 0 4d23h
volcano-scheduler-7cc766767b-hvbvb 1/1 Running 0 4d23h
```
## kpl-launcher安装
目前kpl-launcher服务进行部署的过程包含如下内容
* (事先准备好的)命名空间:"kpl"
* `k8s/kpl-launcher/rbac.yaml`
* ServiceAccount:"kpl-launcher"
* ClusterRole:"kpl-launcher"
* ClusterRoleBinding:"kpl-launcher"
* `k8s/kpl-launcher/deployment.yaml`
* Deployment:"kpl-launcher"
* 服务启动命令的可选参数
```shell
$ kpl_launcher --help
Usage of ./build/bin/kpl_launcher:
-address string
service listening address (default "[::]")
-port int
service listening port (default 8000)
-private-key string
private key for ssl/tls secured service
-cert-chain string
certificate chain for ssl/tls secured service
-incluster
if use incluster config
-local-config string
(optional) absolute path to the kubeconfig file (default "~/.kube/config")
```
* Service (默认为ClusterIP类型):"kpl-launcher-service"
具体部署步骤如下:
* 联系相关开发负责人(刘弘也)确认当前要部署的镜像版本,如:`hub.kce.ksyun.com/aivc-kpl/kpl-launcher:launcher-9efebf5`
* 修改`k8s/kpl-launcher/deployment.yaml`中的部署镜像:
```shell
export IMAGE_NAME=hub.kce.ksyun.com/aivc-kpl/kpl-launcher:launcher-f2d3958
sed -i "s|image: .*|image: ${IMAGE_NAME}|" k8s/kpl-launcher/deployment.yaml
```
* 创建命名空间kpl
```shell
kubectl create ns kpl
```
* 准备好ssl/tls的证书文件
* 使用openssl生成自签名证书(注意指定CN):
```shell
mkdir certs
# openssl req -newkey rsa:2048 -nodes -keyout certs/server.key -x509 -days 3650 -out certs/server.crt -subj "/CN=KPL"
openssl ecparam -genkey -name secp384r1 -out certs/server.key
openssl req -new -x509 -sha256 -key certs/server.key -out certs/server.crt -days 3650 -subj "/CN=KPL"
```
* 生成一个叫kpl-ssl的ConfigMap,其包含刚才生成的两个ssl证书文件:
```shell
kubectl -n kpl create configmap kpl-ssl --from-file=./certs
```
* 注意保留`certs`里的证书文件,客户端程序也需要使用。
* 安装kpl-launcher
```shell
kubectl apply -f k8s/kpl-launcher/rbac.yaml
kubectl apply -f k8s/kpl-launcher/deployment.yaml
```
* 检查kpl-launcher服务是否已经启动
```shell
$ kubectl logs -n kpl kpl-launcher-5b9b6d74bc-swhmm
I0526 13:01:17.015731 6 main.go:82] Start in secured mode ...
I0526 13:01:17.020704 6 launcher.go:44] new launcher with the following backends:
I0526 13:01:17.020714 6 launcher.go:46] volcano: &{0xc00036b680 0xc000402940 map[]}
I0526 13:01:17.020727 6 launcher.go:46] simple_job: &{0xc00036b680 map[]}
I0526 13:01:17.020742 6 main.go:96] try to start launcher server at [::]:8000
```
#!/bin/bash
# Author: wangtingwei
#
# Redeploys volcano and kpl-launcher from the manifests under ./k8s_yaml.
#
# Usage: <script> redeploy_all | private_deploy
#
# The project directory differs between environments, so cd into the project
# directory before running this script: k8s_yaml and image_list.txt are
# resolved relative to the current directory and must not be made absolute.

# Full name of the latest launcher image, read from image_list.txt.
# Abort early when it is missing or empty, otherwise the placeholder in the
# deployment template would be replaced by an empty string.
if [ ! -r image_list.txt ]; then
    echo "image_list.txt not found in $(pwd); run this script from the project directory" >&2
    exit 1
fi
IMAGE_NAME=$(cat image_list.txt)
if [ -z "$IMAGE_NAME" ]; then
    echo "image_list.txt is empty; nothing to deploy" >&2
    exit 1
fi
# Keep only the trailing "name:tag" part of the image reference.
PRI_IMAGE_NAME=$(echo "$IMAGE_NAME" | awk -F '/' '{print $NF}')
# Timestamp used to name the rendered manifests in /tmp.
date=$(date +%F-%H-%M)
####################################################################
# Create the volcano-system namespace when it does not exist yet. The namespace
# is queried by name because grepping the full listing also matches any other
# namespace whose name merely contains "volcano-system".
if kubectl get ns volcano-system >/dev/null 2>&1; then
    echo "volcano-system namespace already exit,continue operation"
else
    echo "namespace not found, autocreate namespace volcano-system" && kubectl create namespace volcano-system
fi
####################################################################
# Public-cloud variant: copy the launcher template to /tmp and substitute the
# image placeholder with the full image name.
# '|' is the sed delimiter because image references contain '/' and, in digest
# form, '@'.
sed_image_name () {
    echo "拷贝yaml到临时目录,公有云环境下sed修改镜像"
    sleep 2
    cp -a k8s_yaml/kpl-launcher/deployment.yaml "/tmp/deployment-$date.yaml" || return 1
    sed -i "s|IMAGE_NAME|$IMAGE_NAME|g" "/tmp/deployment-$date.yaml"
}
####################################################################
# Private-deployment variant: point the volcano and launcher images at
# harbor_host/k8s/ instead of hub.kce.ksyun.com.
private_sed_image_name() {
    # Work on copies in /tmp so sed never modifies the template files; a bad
    # substitution therefore needs no recovery step.
    cp -a k8s_yaml/volcano/volcano-development.yaml "/tmp/volcano-development-$date.yaml" || return 1
    cp -a k8s_yaml/kpl-launcher/deployment.yaml "/tmp/deployment-$date.yaml" || return 1
    # Dots are escaped so they match literally rather than as "any character".
    sed -i "s|hub\.kce\.ksyun\.com/aivc/volcanosh/vc-scheduler:latest|harbor_host/k8s/vc-scheduler:latest|g" "/tmp/volcano-development-$date.yaml"
    sed -i "s|hub\.kce\.ksyun\.com/aivc/volcanosh/vc-webhook-manager:latest|harbor_host/k8s/vc-webhook-manager:latest|g" "/tmp/volcano-development-$date.yaml"
    sed -i "s|hub\.kce\.ksyun\.com/aivc/volcanosh/vc-controller-manager:latest|harbor_host/k8s/vc-controller-manager:latest|g" "/tmp/volcano-development-$date.yaml"
    sed -i "s|IMAGE_NAME|harbor_host/k8s/$PRI_IMAGE_NAME|g" "/tmp/deployment-$date.yaml"
}
######################################################################
# Remove everything a previous deployment created. --ignore-not-found keeps a
# first-time run (nothing to delete yet) from reporting errors.
delete_server () {
    kubectl delete --ignore-not-found -f k8s_yaml/kpl-ssl-configmap.yaml
    kubectl delete --ignore-not-found -f k8s_yaml/kpl-ssl-configmap-autodl.yaml
    kubectl delete --ignore-not-found -f k8s_yaml/volcano/ && sleep 2
    kubectl delete --ignore-not-found -f k8s_yaml/kpl-launcher/ && sleep 2
    kubectl delete --ignore-not-found secrets -n volcano-system volcano-admission-secret
}
# Public-cloud variant: volcano is applied straight from the templates, the
# launcher from the rendered copy in /tmp.
create_server () {
    kubectl apply -f k8s_yaml/kpl-ssl-configmap.yaml
    kubectl apply -f k8s_yaml/kpl-ssl-configmap-autodl.yaml
    kubectl apply -f k8s_yaml/kpl-launcher/rbac.yaml
    kubectl apply -f k8s_yaml/volcano/
    kubectl apply -f "/tmp/deployment-$date.yaml" && sleep 3
}
# Private-deployment variant: both volcano and the launcher are applied from
# the rendered copies in /tmp.
private_create_server () {
    kubectl apply -f k8s_yaml/kpl-ssl-configmap.yaml
    kubectl apply -f k8s_yaml/kpl-ssl-configmap-autodl.yaml
    kubectl apply -f k8s_yaml/kpl-launcher/rbac.yaml
    kubectl apply -f "/tmp/volcano-development-$date.yaml" && sleep 3
    kubectl apply -f "/tmp/deployment-$date.yaml" && sleep 3
}
# Full redeploy for the public-cloud environment.
redeploy_all () {
    delete_server
    sed_image_name || exit 1
    create_server
}
# Full redeploy for a private environment.
private_deploy () {
    delete_server
    private_sed_image_name || exit 1
    private_create_server
}
case "$1" in
    redeploy_all)
        redeploy_all
        ;;
    private_deploy)
        private_deploy
        ;;
    *)
        echo "please input (redeploy_all or private_deploy)"
        # A missing or unknown sub-command is a usage error.
        exit 1
        ;;
esac
hub.kce.ksyun.com/aivc-kpl/kpl-launcher:launcher-854c5ac
\ No newline at end of file
-----BEGIN CERTIFICATE-----
MIICKTCCAbCgAwIBAgIJAOEzff/TB45/MAoGCCqGSM49BAMCMFMxCzAJBgNVBAYT
AkFVMRMwEQYDVQQIDApTb21lLVN0YXRlMSEwHwYDVQQKDBhJbnRlcm5ldCBXaWRn
aXRzIFB0eSBMdGQxDDAKBgNVBAMMA0tQTDAeFw0yMDA2MDMwMzA2MDRaFw0zMDA2
MDEwMzA2MDRaMFMxCzAJBgNVBAYTAkFVMRMwEQYDVQQIDApTb21lLVN0YXRlMSEw
HwYDVQQKDBhJbnRlcm5ldCBXaWRnaXRzIFB0eSBMdGQxDDAKBgNVBAMMA0tQTDB2
MBAGByqGSM49AgEGBSuBBAAiA2IABDUlvu7GTOB4kYiqBuRqiLU3chqccZhMFL16
olmMX31M8EWA0VXj5JeMo4js7NcuBRfFp2JIdhqOroodz+Bu64nmhjbr7Qkglk14
XguoUrwycTAlba2JdBpXRXwY5uP7eqNQME4wHQYDVR0OBBYEFPQ81JjaE8UG4FyX
Hjo09H9dRkcEMB8GA1UdIwQYMBaAFPQ81JjaE8UG4FyXHjo09H9dRkcEMAwGA1Ud
EwQFMAMBAf8wCgYIKoZIzj0EAwIDZwAwZAIwSCzsAdwv5fJOlAMI6W+0s5whygR3
VQEq88EffPmjQ8Cn6rqWFzev4Cd5W18Qput9AjAjoBh5WdlK1N0sIZpRLaCYK7El
2vab3X1CbV8MkwGJU7Vnjav+w185kSNpbpF6idw=
-----END CERTIFICATE-----
-----BEGIN EC PARAMETERS-----
BgUrgQQAIg==
-----END EC PARAMETERS-----
-----BEGIN EC PRIVATE KEY-----
MIGkAgEBBDCc8hpwAUrmEZUnFeD4Fi/OnMT2fAXVtJ50FIR/HCWMD/pPDV1uKLZI
Hm6h6fRQX82gBwYFK4EEACKhZANiAAQ1Jb7uxkzgeJGIqgbkaoi1N3IanHGYTBS9
eqJZjF99TPBFgNFV4+SXjKOI7OzXLgUXxadiSHYajq6KHc/gbuuJ5oY26+0JIJZN
eF4LqFK8MnEwJW2tiXQaV0V8GObj+3o=
-----END EC PRIVATE KEY-----
---
# kpl-launcher Deployment: a single replica serving on port 8000 with the TLS
# key pair mounted at /etc/kpl/ssl.
kind: Deployment
apiVersion: apps/v1
metadata:
  name: kpl-launcher
  namespace: kpl
  labels:
    app: kpl-launcher
spec:
  replicas: 1
  selector:
    matchLabels:
      app: kpl-launcher
  template:
    metadata:
      labels:
        app: kpl-launcher
    spec:
      # serviceAccountName replaces the deprecated "serviceAccount" alias.
      serviceAccountName: kpl-launcher
      containers:
        - name: launcher
          # Template placeholder for the image repository and name; it is
          # substituted before this manifest is applied.
          image: IMAGE_NAME
          command:
            - /bin/bash
            - -c
            - kpl_launcher --incluster --private-key /etc/kpl/ssl/server.key --cert-chain /etc/kpl/ssl/server.crt --port 8000 2>&1
          ports:
            - containerPort: 8000
              name: launcher-port
          imagePullPolicy: IfNotPresent
          resources:
            # Requests were newly added to lower the scheduling requirements.
            requests:
              cpu: "1"
              memory: 100Mi
            limits:
              cpu: "8"
              memory: 100Mi
          env:
            - name: KPL_IMAGE_SECRET_NAME
              value: kpl-regcred
          volumeMounts:
            - name: kpl-ssl
              mountPath: /etc/kpl/ssl
              readOnly: true
      volumes:
        # NOTE(review): the TLS private key is mounted from a ConfigMap; a
        # Secret is the object meant for key material - consider migrating.
        - name: kpl-ssl
          configMap:
            name: kpl-ssl
      imagePullSecrets:
        - name: kpl-regcred
---
# Service exposing the launcher port; no type is set, so the Kubernetes
# default applies (the NodePort line is left commented out).
apiVersion: v1
kind: Service
metadata:
  labels:
    app: kpl-launcher
  name: kpl-launcher-service
  namespace: kpl
spec:
  ports:
    - port: 8000
      protocol: TCP
      targetPort: 8000
  # type: NodePort
  selector:
    app: kpl-launcher
---
# ServiceAccount "kpl-launcher" in the kpl namespace.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kpl-launcher
  namespace: kpl
---
# Cluster-wide permissions bound to the ServiceAccount above.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: kpl-launcher
rules:
  - apiGroups: [apiextensions.k8s.io]
    resources: [customresourcedefinitions]
    verbs: [get, list, watch]
  - apiGroups: [batch.volcano.sh]
    resources: [jobs]
    verbs: [get, create, list, watch, update, delete]
  # NOTE(review): "bind" and "updateStatus" do not look like standard request
  # verbs for pods - confirm whether they are needed.
  - apiGroups: [""]
    resources: [pods, pods/status]
    verbs: [create, get, list, watch, update, bind, updateStatus, delete]
  # Everything below is read-only (list/watch).
  - apiGroups: [""]
    resources: [persistentvolumeclaims]
    verbs: [list, watch]
  - apiGroups: [""]
    resources: [persistentvolumes]
    verbs: [list, watch]
  - apiGroups: [""]
    resources: [services]
    verbs: [list, watch]
  - apiGroups: [apps]
    resources: [deployments]
    verbs: [list, watch]
  - apiGroups: [""]
    resources: [namespaces]
    verbs: [list, watch]
  - apiGroups: [storage.k8s.io]
    resources: [storageclasses]
    verbs: [list, watch]
  - apiGroups: [""]
    resources: [nodes]
    verbs: [list, watch]
---
# Grants the ClusterRole to the kpl-launcher ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: kpl-launcher
subjects:
  - kind: ServiceAccount
    name: kpl-launcher
    namespace: kpl
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kpl-launcher
---
# TLS certificate and key published as ConfigMap "kpl-ssl" in the autodl
# namespace.
# NOTE(review): server.key is a private key stored in a ConfigMap and checked
# into version control - consider a Secret and rotating the key.
apiVersion: v1
kind: ConfigMap
metadata:
  name: kpl-ssl
  namespace: autodl
data:
  server.crt: |
    -----BEGIN CERTIFICATE-----
    MIICKTCCAbCgAwIBAgIJAOEzff/TB45/MAoGCCqGSM49BAMCMFMxCzAJBgNVBAYT
    AkFVMRMwEQYDVQQIDApTb21lLVN0YXRlMSEwHwYDVQQKDBhJbnRlcm5ldCBXaWRn
    aXRzIFB0eSBMdGQxDDAKBgNVBAMMA0tQTDAeFw0yMDA2MDMwMzA2MDRaFw0zMDA2
    MDEwMzA2MDRaMFMxCzAJBgNVBAYTAkFVMRMwEQYDVQQIDApTb21lLVN0YXRlMSEw
    HwYDVQQKDBhJbnRlcm5ldCBXaWRnaXRzIFB0eSBMdGQxDDAKBgNVBAMMA0tQTDB2
    MBAGByqGSM49AgEGBSuBBAAiA2IABDUlvu7GTOB4kYiqBuRqiLU3chqccZhMFL16
    olmMX31M8EWA0VXj5JeMo4js7NcuBRfFp2JIdhqOroodz+Bu64nmhjbr7Qkglk14
    XguoUrwycTAlba2JdBpXRXwY5uP7eqNQME4wHQYDVR0OBBYEFPQ81JjaE8UG4FyX
    Hjo09H9dRkcEMB8GA1UdIwQYMBaAFPQ81JjaE8UG4FyXHjo09H9dRkcEMAwGA1Ud
    EwQFMAMBAf8wCgYIKoZIzj0EAwIDZwAwZAIwSCzsAdwv5fJOlAMI6W+0s5whygR3
    VQEq88EffPmjQ8Cn6rqWFzev4Cd5W18Qput9AjAjoBh5WdlK1N0sIZpRLaCYK7El
    2vab3X1CbV8MkwGJU7Vnjav+w185kSNpbpF6idw=
    -----END CERTIFICATE-----
  server.key: |
    -----BEGIN EC PARAMETERS-----
    BgUrgQQAIg==
    -----END EC PARAMETERS-----
    -----BEGIN EC PRIVATE KEY-----
    MIGkAgEBBDCc8hpwAUrmEZUnFeD4Fi/OnMT2fAXVtJ50FIR/HCWMD/pPDV1uKLZI
    Hm6h6fRQX82gBwYFK4EEACKhZANiAAQ1Jb7uxkzgeJGIqgbkaoi1N3IanHGYTBS9
    eqJZjF99TPBFgNFV4+SXjKOI7OzXLgUXxadiSHYajq6KHc/gbuuJ5oY26+0JIJZN
    eF4LqFK8MnEwJW2tiXQaV0V8GObj+3o=
    -----END EC PRIVATE KEY-----
---
# TLS certificate and key published as ConfigMap "kpl-ssl" in the kpl
# namespace.
# The server-populated metadata fields (creationTimestamp, resourceVersion,
# selfLink, uid) were removed: they describe one live object, and a stale
# resourceVersion can make "kubectl apply" fail with a conflict.
# NOTE(review): server.key is a private key stored in a ConfigMap and checked
# into version control - consider a Secret and rotating the key.
apiVersion: v1
kind: ConfigMap
metadata:
  name: kpl-ssl
  namespace: kpl
data:
  server.crt: |
    -----BEGIN CERTIFICATE-----
    MIICKTCCAbCgAwIBAgIJAOEzff/TB45/MAoGCCqGSM49BAMCMFMxCzAJBgNVBAYT
    AkFVMRMwEQYDVQQIDApTb21lLVN0YXRlMSEwHwYDVQQKDBhJbnRlcm5ldCBXaWRn
    aXRzIFB0eSBMdGQxDDAKBgNVBAMMA0tQTDAeFw0yMDA2MDMwMzA2MDRaFw0zMDA2
    MDEwMzA2MDRaMFMxCzAJBgNVBAYTAkFVMRMwEQYDVQQIDApTb21lLVN0YXRlMSEw
    HwYDVQQKDBhJbnRlcm5ldCBXaWRnaXRzIFB0eSBMdGQxDDAKBgNVBAMMA0tQTDB2
    MBAGByqGSM49AgEGBSuBBAAiA2IABDUlvu7GTOB4kYiqBuRqiLU3chqccZhMFL16
    olmMX31M8EWA0VXj5JeMo4js7NcuBRfFp2JIdhqOroodz+Bu64nmhjbr7Qkglk14
    XguoUrwycTAlba2JdBpXRXwY5uP7eqNQME4wHQYDVR0OBBYEFPQ81JjaE8UG4FyX
    Hjo09H9dRkcEMB8GA1UdIwQYMBaAFPQ81JjaE8UG4FyXHjo09H9dRkcEMAwGA1Ud
    EwQFMAMBAf8wCgYIKoZIzj0EAwIDZwAwZAIwSCzsAdwv5fJOlAMI6W+0s5whygR3
    VQEq88EffPmjQ8Cn6rqWFzev4Cd5W18Qput9AjAjoBh5WdlK1N0sIZpRLaCYK7El
    2vab3X1CbV8MkwGJU7Vnjav+w185kSNpbpF6idw=
    -----END CERTIFICATE-----
  server.key: |
    -----BEGIN EC PARAMETERS-----
    BgUrgQQAIg==
    -----END EC PARAMETERS-----
    -----BEGIN EC PRIVATE KEY-----
    MIGkAgEBBDCc8hpwAUrmEZUnFeD4Fi/OnMT2fAXVtJ50FIR/HCWMD/pPDV1uKLZI
    Hm6h6fRQX82gBwYFK4EEACKhZANiAAQ1Jb7uxkzgeJGIqgbkaoi1N3IanHGYTBS9
    eqJZjF99TPBFgNFV4+SXjKOI7OzXLgUXxadiSHYajq6KHc/gbuuJ5oY26+0JIJZN
    eF4LqFK8MnEwJW2tiXQaV0V8GObj+3o=
    -----END EC PRIVATE KEY-----
......@@ -31,4 +31,7 @@ data:
DCGM_FI_DEV_PCIE_MAX_LINK_GEN, gauge, ningfd pcie max link gen
DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH, gauge, ningfd pcie max link width
DCGM_FI_PROF_PCIE_RX_BYTES, gauge, ningfd pcie read bytes
DCGM_FI_PROF_PCIE_TX_BYTES, gauge, ningfd pcie trans bytes
\ No newline at end of file
DCGM_FI_PROF_PCIE_TX_BYTES, gauge, ningfd pcie trans bytes
DCGM_FI_DEV_FB_FREE, gauge, ningfd memory in MB
DCGM_FI_DEV_FB_TOTAL, gauge, ningfd memory in MB
DCGM_FI_DEV_FB_USED, gauge, ningfd memory in MB
\ No newline at end of file
......@@ -10,12 +10,13 @@ spec:
metadata:
labels:
app: wd-dcgm
monitor-by-prometheus: "true"
spec:
imagePullSecrets:
- name: "harbor-secret"
serviceAccountName: autodl-serviceaccount
nodeSelector:
gpu-type-nvidia: "true"
gpu-device-type-nvidia: "true"
kpl: "true"
containers:
- name: wd-dcgm
......@@ -53,7 +54,7 @@ spec:
volumes:
- name: "pod-gpu-resources"
hostPath:
path: "/var/lib/kubelet/pod-resources"
path: "/data/kubelet/pod-resources"
- name: config
configMap:
name: wd-dcgm-config
......@@ -9,6 +9,7 @@ spec:
metadata:
labels:
app: wd-grafana
monitor-by-prometheus: "true"
spec:
imagePullSecrets:
- name: "harbor-secret"
......
......@@ -10,6 +10,7 @@ spec:
metadata:
labels:
app: wd-exporter
monitor-by-prometheus: "true"
spec:
imagePullSecrets:
- name: "harbor-secret"
......
......@@ -126,116 +126,6 @@ data:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
# Example scrape config for service endpoints.
#
# The relabeling allows the actual service scrape endpoint to be configured
# for all or only some endpoints.
- job_name: 'kubernetes-service-endpoints'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
# Example relabel to scrape only endpoints that have
# "example.io/should_be_scraped = true" annotation.
# - source_labels: [__meta_kubernetes_service_annotation_example_io_should_be_scraped]
# action: keep
# regex: true
#
# Example relabel to customize metric path based on endpoints
# "example.io/metric_path = <metric path>" annotation.
# - source_labels: [__meta_kubernetes_service_annotation_example_io_metric_path]
# action: replace
# target_label: __metrics_path__
# regex: (.+)
#
# Example relabel to scrape only single, desired port for the service based
# on endpoints "example.io/scrape_port = <port>" annotation.
# - source_labels: [__address__, __meta_kubernetes_service_annotation_example_io_scrape_port]
# action: replace
# regex: ([^:]+)(?::\d+)?;(\d+)
# replacement: $1:$2
# target_label: __address__
#
# Example relabel to configure scrape scheme for all service scrape targets
# based on endpoints "example.io/scrape_scheme = <scheme>" annotation.
# - source_labels: [__meta_kubernetes_service_annotation_example_io_scrape_scheme]
# action: replace
# target_label: __scheme__
# regex: (https?)
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
# Example scrape config for probing services via the Blackbox Exporter.
#
# The relabeling allows the actual service scrape endpoint to be configured
# for all or only some services.
- job_name: 'kubernetes-services'
metrics_path: /probe
params:
module: [http_2xx]
kubernetes_sd_configs:
- role: service
relabel_configs:
# Example relabel to probe only some services that have "example.io/should_be_probed = true" annotation
# - source_labels: [__meta_kubernetes_service_annotation_example_io_should_be_probed]
# action: keep
# regex: true
- source_labels: [__address__]
target_label: __param_target
- target_label: __address__
replacement: blackbox-exporter.example.com:9115
- source_labels: [__param_target]
target_label: instance
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
target_label: kubernetes_name
# Example scrape config for probing ingresses via the Blackbox Exporter.
#
# The relabeling allows the actual ingress scrape endpoint to be configured
# for all or only some services.
- job_name: 'kubernetes-ingresses'
metrics_path: /probe
params:
module: [http_2xx]
kubernetes_sd_configs:
- role: ingress
relabel_configs:
# Example relabel to probe only some ingresses that have "example.io/should_be_probed = true" annotation
# - source_labels: [__meta_kubernetes_ingress_annotation_example_io_should_be_probed]
# action: keep
# regex: true
- source_labels: [__meta_kubernetes_ingress_scheme,__address__,__meta_kubernetes_ingress_path]
regex: (.+);(.+);(.+)
replacement: ${1}://${2}${3}
target_label: __param_target
- target_label: __address__
replacement: blackbox-exporter.example.com:9115
- source_labels: [__param_target]
target_label: instance
- action: labelmap
regex: __meta_kubernetes_ingress_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_ingress_name]
target_label: kubernetes_name
# Example scrape config for pods
#
# The relabeling allows the actual pod scrape to be configured
......@@ -245,6 +135,9 @@ data:
kubernetes_sd_configs:
- role: pod
selectors:
- role: pod
label: "monitor-by-prometheus=true"
relabel_configs:
# Example relabel to scrape only pods that have
......@@ -267,6 +160,9 @@ data:
# regex: ([^:]+)(?::\d+)?;(\d+)
# replacement: $1:$2
# target_label: __address__
#- source_labels: [__meta_kubernetes_pod_label_app]
# action: keep
# regex: (^wd*).*
- action: replace
regex: (.*)
replacement: $1
......
......@@ -9,6 +9,7 @@ spec:
metadata:
labels:
app: wd-prometheus
monitor-by-prometheus: "true"
spec:
imagePullSecrets:
- name: "harbor-secret"
......
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!