Commit 7ccff579 by 王庭威

增加 watchdog 监控

1 parent 23f6f703
---
# Counter list for the DCGM exporter. The wd-dcgm DaemonSet mounts the
# `dcgm-config.yml` key at /etc/dcgm-exporter/default-counters.csv.
kind: ConfigMap
apiVersion: v1
metadata:
  namespace: autodl
  name: wd-dcgm-config
data:
  # One CSV row per metric: <DCGM field>, <Prometheus metric type>, <help text>.
  dcgm-config.yml: |
    DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
    DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
    DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
    DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C).
    DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W).
    DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).
    DCGM_FI_DEV_PCIE_TX_THROUGHPUT, counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML.
    DCGM_FI_DEV_PCIE_RX_THROUGHPUT, counter, Total number of bytes received through PCIe RX (in KB) via NVML.
    DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.
    DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %).
    DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
    DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %).
    DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %).
    DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered.
    DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB).
    DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB).
    DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes
    DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status
    DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
    DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors
    DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed
    DCGM_FI_DEV_PCIE_LINK_GEN, gauge, ningfd pcie link gen
    DCGM_FI_DEV_PCIE_LINK_WIDTH, gauge, ningfd pcie link width
    DCGM_FI_DEV_PCIE_MAX_LINK_GEN, gauge, ningfd pcie max link gen
    DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH, gauge, ningfd pcie max link width
    DCGM_FI_PROF_PCIE_RX_BYTES, gauge, ningfd pcie read bytes
    DCGM_FI_PROF_PCIE_TX_BYTES, gauge, ningfd pcie trans bytes
\ No newline at end of file
---
# DaemonSet running the dcgm-exporter image on nodes labelled
# gpu-type-nvidia=true and kpl=true. The container listens on :9400.
#
# Uses apps/v1: extensions/v1beta1 DaemonSets are no longer served as of
# Kubernetes 1.16.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: wd-dcgm
  namespace: autodl
spec:
  # apps/v1 requires an explicit selector; it must match the template labels.
  selector:
    matchLabels:
      app: wd-dcgm
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: wd-dcgm
    spec:
      imagePullSecrets:
        - name: "harbor-secret"
      serviceAccountName: autodl-serviceaccount
      nodeSelector:
        gpu-type-nvidia: "true"
        kpl: "true"
      containers:
        - name: wd-dcgm
          image: hub.kce.ksyun.com/kpl_k8s/dcgm-exporter:2.1.4-2.3.0-ubuntu18.04
          imagePullPolicy: Always
          env:
            - name: "DCGM_EXPORTER_LISTEN"
              value: ":9400"
            - name: "DCGM_EXPORTER_KUBERNETES"
              value: "true"
          volumeMounts:
            # Host kubelet pod-resources directory, mounted read-only.
            - name: "pod-gpu-resources"
              readOnly: true
              mountPath: "/var/lib/kubelet/pod-resources"
            # Replaces the exporter's default counter list with the
            # `dcgm-config.yml` key of the wd-dcgm-config ConfigMap.
            - name: config
              subPath: "dcgm-config.yml"
              mountPath: "/etc/dcgm-exporter/default-counters.csv"
          # The container runs as root.
          securityContext:
            runAsNonRoot: false
            runAsUser: 0
          ports:
            - containerPort: 9400
              name: port-9400
          livenessProbe:
            initialDelaySeconds: 30
            tcpSocket:
              port: port-9400
          resources:
            requests:
              cpu: "500m"
              memory: "512Mi"
            limits:
              cpu: "2000m"
              memory: "3Gi"
      volumes:
        - name: "pod-gpu-resources"
          hostPath:
            path: "/var/lib/kubelet/pod-resources"
        - name: config
          configMap:
            name: wd-dcgm-config
---
# NodePort Service for pods labelled app=wd-dcgm: service port 9400 is
# forwarded to container port 9400 and published on node port 30702.
kind: Service
apiVersion: v1
metadata:
  namespace: autodl
  name: wd-dcgm
spec:
  selector:
    app: wd-dcgm
  type: NodePort
  ports:
    - name: port-9400
      protocol: TCP
      port: 9400
      targetPort: 9400
      nodePort: 30702
This diff could not be displayed because it is too large.
---
# Grafana configuration bundle. Each key is mounted as an individual file
# by the wd-grafana Deployment (grafana.ini plus two provisioning files).
kind: ConfigMap
apiVersion: v1
metadata:
  namespace: autodl
  name: wd-grafana-config
data:
  # Main Grafana settings: served under the /grafana/ sub-path, anonymous
  # access and embedding enabled.
  # NOTE(review): the admin password is committed here in plain text.
  # Consider moving it to a Secret and injecting it through the
  # GF_SECURITY_ADMIN_PASSWORD environment variable instead.
  grafana.ini: |
    [server]
    domain = wd-grafana.autodl.svc.cluster.local
    root_url = %(protocol)s://%(domain)s:%(http_port)s/grafana/
    serve_from_sub_path = true
    [auth.anonymous]
    enabled = true
    [security]
    allow_embedding = true
    admin_user = admin
    admin_password = seetatech
  # Dashboard provider: loads dashboard files from /etc/dashboards and
  # rescans every 360 seconds.
  grafana-dashboard-migrate.yaml: |
    apiVersion: 1
    providers:
      - name: dashboards
        type: file
        updateIntervalSeconds: 360
        options:
          path: /etc/dashboards
          foldersFromFilesStructure: true
  # Datasource provisioning: deletes and re-creates a non-editable
  # "Prometheus" datasource pointing at the wd-prometheus Service.
  grafana-data-source-migrate.yaml: |
    apiVersion: 1
    deleteDatasources:
      - name: Prometheus
        orgId: 1
    datasources:
      - name: Prometheus
        type: prometheus
        access: proxy
        orgId: 1
        uid: prometheus
        url: http://wd-prometheus.autodl.svc.cluster.local:9090
        password:
        user:
        database:
        basicAuth:
        basicAuthUser:
        basicAuthPassword:
        withCredentials:
        isDefault:
        jsonData:
        secureJsonData:
        version: 1
        editable: false
\ No newline at end of file
---
# Single-replica Grafana Deployment, scheduled on nodes labelled kpl=true
# and internal_service_node=true. The container exposes port 3000.
#
# Uses apps/v1: extensions/v1beta1 Deployments are no longer served as of
# Kubernetes 1.16.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: wd-grafana
  namespace: autodl
spec:
  replicas: 1
  # apps/v1 requires an explicit selector; it must match the template labels.
  selector:
    matchLabels:
      app: wd-grafana
  template:
    metadata:
      labels:
        app: wd-grafana
    spec:
      imagePullSecrets:
        - name: "harbor-secret"
      serviceAccountName: autodl-serviceaccount
      nodeSelector:
        kpl: "true"
        internal_service_node: "true"
      containers:
        - name: wd-grafana
          image: hub.kce.ksyun.com/kpl_k8s/grafana:7.4.3-ubuntu
          imagePullPolicy: Always
          volumeMounts:
            # Three keys of wd-grafana-config are mounted as single files
            # via subPath so the rest of each target directory is untouched.
            - name: config
              subPath: grafana.ini
              mountPath: /etc/grafana/grafana.ini
            - name: config
              subPath: grafana-dashboard-migrate.yaml
              mountPath: /etc/grafana/provisioning/dashboards/grafana-dashboard-migrate.yaml
            - name: config
              subPath: grafana-data-source-migrate.yaml
              mountPath: /etc/grafana/provisioning/datasources/grafana-data-source-migrate.yaml
            # Directory the dashboard provider in wd-grafana-config reads.
            - name: config-dashboard
              mountPath: /etc/dashboards
          ports:
            - containerPort: 3000
              name: port-3000
          livenessProbe:
            initialDelaySeconds: 120
            tcpSocket:
              port: port-3000
          resources:
            requests:
              cpu: "500m"
              memory: "512Mi"
            limits:
              cpu: "2000m"
              memory: "3Gi"
      volumes:
        - name: config
          configMap:
            name: wd-grafana-config
        # NOTE(review): this ConfigMap is not defined in the visible part of
        # the change; confirm it is created before the Deployment rolls out.
        - name: config-dashboard
          configMap:
            name: wd-grafana-config-dashboard
---
# NodePort Service for pods labelled app=wd-grafana: service port 3000 is
# forwarded to container port 3000 and published on node port 30700.
kind: Service
apiVersion: v1
metadata:
  namespace: autodl
  name: wd-grafana
spec:
  selector:
    app: wd-grafana
  type: NodePort
  ports:
    - name: port-3000
      protocol: TCP
      port: 3000
      targetPort: 3000
      nodePort: 30700
---
# DaemonSet running Prometheus node_exporter on nodes labelled kpl=true.
# Host /proc, /sys and / are mounted read-only under /host so the exporter
# reports host-level metrics. The container exposes port 9100.
#
# Uses apps/v1: extensions/v1beta1 DaemonSets are no longer served as of
# Kubernetes 1.16.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: wd-node-exporter
  namespace: autodl
spec:
  # apps/v1 requires an explicit selector; it must match the template labels.
  selector:
    matchLabels:
      app: wd-node-exporter
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        # Must equal the selector of the wd-node-exporter Service
        # (app: wd-node-exporter); otherwise the Service has no endpoints.
        app: wd-node-exporter
    spec:
      imagePullSecrets:
        - name: "harbor-secret"
      serviceAccountName: autodl-serviceaccount
      nodeSelector:
        kpl: "true"
      containers:
        - name: wd-node-exporter
          image: hub.kce.ksyun.com/kpl_k8s/prometheus_node_exporter:v1.0.0-rc.0
          imagePullPolicy: Always
          args:
            # Point the exporter at the host filesystems mounted below.
            - --path.procfs=/host/proc
            - --path.sysfs=/host/sys
            - --path.rootfs=/host/root
            # Regexes are single-quoted so YAML never interprets their
            # special characters; the values themselves are unchanged.
            - '--collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker|var/lib/containers/.+)($|/)'
            - '--collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$'
          volumeMounts:
            - mountPath: /host/proc
              name: proc
              readOnly: true
            - mountPath: /host/sys
              name: sys
              readOnly: true
            - mountPath: /host/root
              name: root
              readOnly: true
          # The container runs as root.
          securityContext:
            runAsNonRoot: false
            runAsUser: 0
          ports:
            - containerPort: 9100
              name: port-9100
          livenessProbe:
            initialDelaySeconds: 30
            tcpSocket:
              port: port-9100
          resources:
            requests:
              cpu: "500m"
              memory: "512Mi"
            limits:
              cpu: "2000m"
              memory: "3Gi"
      volumes:
        - hostPath:
            path: /proc
          name: proc
        - hostPath:
            path: /sys
          name: sys
        - hostPath:
            path: /
          name: root
\ No newline at end of file
---
# NodePort Service for pods labelled app=wd-node-exporter: service port 9100
# is forwarded to container port 9100 and published on node port 30703.
kind: Service
apiVersion: v1
metadata:
  namespace: autodl
  name: wd-node-exporter
spec:
  selector:
    app: wd-node-exporter
  type: NodePort
  ports:
    - name: port-9100
      protocol: TCP
      port: 9100
      targetPort: 9100
      nodePort: 30703
---
# Single-replica Prometheus Deployment, scheduled on nodes labelled kpl=true
# and internal_service_node=true. The container exposes port 9090.
#
# Uses apps/v1: extensions/v1beta1 Deployments are no longer served as of
# Kubernetes 1.16.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: wd-prometheus
  namespace: autodl
spec:
  replicas: 1
  # apps/v1 requires an explicit selector; it must match the template labels.
  selector:
    matchLabels:
      app: wd-prometheus
  template:
    metadata:
      labels:
        app: wd-prometheus
    spec:
      imagePullSecrets:
        - name: "harbor-secret"
      serviceAccountName: autodl-serviceaccount
      nodeSelector:
        kpl: "true"
        internal_service_node: "true"
      containers:
        - name: wd-prometheus
          image: hub.kce.ksyun.com/kpl_k8s/prometheus:v2.17.2
          imagePullPolicy: Always
          volumeMounts:
            # The whole wd-prometheus-config ConfigMap becomes /etc/prometheus.
            - name: config
              mountPath: /etc/prometheus
          command:
            - /bin/prometheus
          args:
            - --config.file=/etc/prometheus/prometheus.yml
          ports:
            - containerPort: 9090
              name: port-9090
          livenessProbe:
            initialDelaySeconds: 30
            tcpSocket:
              port: port-9090
          resources:
            requests:
              cpu: "500m"
              memory: "512Mi"
            limits:
              cpu: "2000m"
              memory: "3Gi"
      volumes:
        # NOTE(review): this ConfigMap is not defined in the visible part of
        # the change; it presumably provides prometheus.yml - confirm.
        - name: config
          configMap:
            name: wd-prometheus-config
---
# NodePort Service for pods labelled app=wd-prometheus: service port 9090 is
# forwarded to container port 9090 and published on node port 30701.
kind: Service
apiVersion: v1
metadata:
  namespace: autodl
  name: wd-prometheus
spec:
  selector:
    app: wd-prometheus
  type: NodePort
  ports:
    - name: port-9090
      protocol: TCP
      port: 9090
      targetPort: 9090
      nodePort: 30701
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!