Commit 7ccff579 by 王庭威

Add watchdog monitoring

1 parent 23f6f703
apiVersion: v1
kind: ConfigMap
metadata:
  name: wd-dcgm-config
  namespace: autodl
data:
  dcgm-config.yml: |
    DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
    DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
    DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
    DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C).
    DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W).
    DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).
    DCGM_FI_DEV_PCIE_TX_THROUGHPUT, counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML.
    DCGM_FI_DEV_PCIE_RX_THROUGHPUT, counter, Total number of bytes received through PCIe RX (in KB) via NVML.
    DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.
    DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %).
    DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
    DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %).
    DCGM_FI_DEV_DEC_UTIL, gauge, Decoder utilization (in %).
    DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered.
    DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB).
    DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB).
    DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes.
    DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU license status.
    DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors.
    DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors.
    DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed.
    DCGM_FI_DEV_PCIE_LINK_GEN, gauge, Current PCIe link generation.
    DCGM_FI_DEV_PCIE_LINK_WIDTH, gauge, Current PCIe link width.
    DCGM_FI_DEV_PCIE_MAX_LINK_GEN, gauge, Maximum PCIe link generation.
    DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH, gauge, Maximum PCIe link width.
    DCGM_FI_PROF_PCIE_RX_BYTES, gauge, PCIe bytes received.
    DCGM_FI_PROF_PCIE_TX_BYTES, gauge, PCIe bytes transmitted.
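Each line of dcgm-config.yml follows dcgm-exporter's counters-file format: a DCGM field identifier, the Prometheus metric type, and free-form help text, separated by commas. The DaemonSet below mounts this key over /etc/dcgm-exporter/default-counters.csv, replacing the image's built-in counter set. As a minimal sketch (a hypothetical trimmed ConfigMap, not part of this commit), a deployment that only needs utilization and memory counters could ship:

apiVersion: v1
kind: ConfigMap
metadata:
  name: wd-dcgm-config-minimal   # hypothetical name, for illustration only
  namespace: autodl
data:
  dcgm-config.yml: |
    DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %).
    DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB).
    DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB).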
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: wd-dcgm
  namespace: autodl
spec:
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: wd-dcgm
    spec:
      imagePullSecrets:
        - name: "harbor-secret"
      serviceAccountName: autodl-serviceaccount
      nodeSelector:
        gpu-type-nvidia: "true"
        kpl: "true"
      containers:
        - name: wd-dcgm
          image: hub.kce.ksyun.com/kpl_k8s/dcgm-exporter:2.1.4-2.3.0-ubuntu18.04
          imagePullPolicy: Always
          env:
            - name: "DCGM_EXPORTER_LISTEN"
              value: ":9400"
            - name: "DCGM_EXPORTER_KUBERNETES"
              value: "true"
          volumeMounts:
            - name: "pod-gpu-resources"
              readOnly: true
              mountPath: "/var/lib/kubelet/pod-resources"
            - name: config
              subPath: "dcgm-config.yml"
              mountPath: "/etc/dcgm-exporter/default-counters.csv"
          securityContext:
            runAsNonRoot: false
            runAsUser: 0
          ports:
            - containerPort: 9400
              name: port-9400
          livenessProbe:
            initialDelaySeconds: 30
            tcpSocket:
              port: port-9400
          resources:
            requests:
              cpu: "500m"
              memory: "512Mi"
            limits:
              cpu: "2000m"
              memory: "3Gi"
      volumes:
        - name: "pod-gpu-resources"
          hostPath:
            path: "/var/lib/kubelet/pod-resources"
        - name: config
          configMap:
            name: wd-dcgm-config
apiVersion: v1
kind: Service
metadata:
  name: wd-dcgm
  namespace: autodl
spec:
  type: NodePort
  selector:
    app: wd-dcgm
  ports:
    - port: 9400
      protocol: TCP
      name: port-9400
      targetPort: 9400
      nodePort: 30702
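With the DaemonSet and Service above in place, every GPU node exposes these counters on port 9400, and the in-cluster Prometheus configured later in this commit discovers them through its pod-based scrape job. The commit ships no alerting rules; a hedged sketch of one, using the DCGM_FI_DEV_GPU_TEMP gauge declared above (the rule group name and threshold are hypothetical):

groups:
  - name: gpu-watchdog            # hypothetical rule group
    rules:
      - alert: GpuTooHot
        # DCGM_FI_DEV_GPU_TEMP reports GPU temperature in degrees C.
        expr: DCGM_FI_DEV_GPU_TEMP > 85
        for: 5m
        labels:
          severity: warning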
This diff could not be displayed because it is too large.
apiVersion: v1
kind: ConfigMap
metadata:
  name: wd-grafana-config
  namespace: autodl
data:
  grafana-dashboard-migrate.yaml: |
    apiVersion: 1
    providers:
      - name: dashboards
        type: file
        updateIntervalSeconds: 360
        options:
          path: /etc/dashboards
          foldersFromFilesStructure: true
  grafana.ini: |
    [server]
    domain = wd-grafana.autodl.svc.cluster.local
    root_url = %(protocol)s://%(domain)s:%(http_port)s/grafana/
    serve_from_sub_path = true
    [auth.anonymous]
    enabled = true
    [security]
    allow_embedding = true
    admin_user = admin
    admin_password = seetatech
  grafana-data-source-migrate.yaml: |
    apiVersion: 1
    deleteDatasources:
      - name: Prometheus
        orgId: 1
    datasources:
      - name: Prometheus
        type: prometheus
        access: proxy
        orgId: 1
        uid: prometheus
        url: http://wd-prometheus.autodl.svc.cluster.local:9090
        password:
        user:
        database:
        basicAuth:
        basicAuthUser:
        basicAuthPassword:
        withCredentials:
        isDefault:
        jsonData:
        secureJsonData:
        version: 1
        editable: false
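The dashboards provider above watches /etc/dashboards, which the Deployment below fills from a second ConfigMap, wd-grafana-config-dashboard; that ConfigMap is likely the diff elided above as too large to display. A minimal sketch of its shape (the dashboard file name and JSON content here are hypothetical):

apiVersion: v1
kind: ConfigMap
metadata:
  name: wd-grafana-config-dashboard
  namespace: autodl
data:
  gpu-overview.json: |   # hypothetical dashboard file loaded by the file provider
    {"title": "GPU Overview", "uid": "gpu-overview", "panels": [], "schemaVersion": 27}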
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: wd-grafana
  namespace: autodl
spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: wd-grafana
    spec:
      imagePullSecrets:
        - name: "harbor-secret"
      serviceAccountName: autodl-serviceaccount
      nodeSelector:
        kpl: "true"
        internal_service_node: "true"
      containers:
        - name: wd-grafana
          image: hub.kce.ksyun.com/kpl_k8s/grafana:7.4.3-ubuntu
          imagePullPolicy: Always
          volumeMounts:
            - name: config
              subPath: grafana.ini
              mountPath: /etc/grafana/grafana.ini
            - name: config
              subPath: grafana-dashboard-migrate.yaml
              mountPath: /etc/grafana/provisioning/dashboards/grafana-dashboard-migrate.yaml
            - name: config
              subPath: grafana-data-source-migrate.yaml
              mountPath: /etc/grafana/provisioning/datasources/grafana-data-source-migrate.yaml
            - name: config-dashboard
              mountPath: /etc/dashboards
          ports:
            - containerPort: 3000
              name: port-3000
          livenessProbe:
            initialDelaySeconds: 120
            tcpSocket:
              port: port-3000
          resources:
            requests:
              cpu: "500m"
              memory: "512Mi"
            limits:
              cpu: "2000m"
              memory: "3Gi"
      volumes:
        - name: config
          configMap:
            name: wd-grafana-config
        - name: config-dashboard
          configMap:
            name: wd-grafana-config-dashboard
apiVersion: v1
kind: Service
metadata:
  name: wd-grafana
  namespace: autodl
spec:
  type: NodePort
  selector:
    app: wd-grafana
  ports:
    - port: 3000
      protocol: TCP
      name: port-3000
      targetPort: 3000
      nodePort: 30700
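grafana.ini above sets serve_from_sub_path with a root_url ending in /grafana/, which implies the UI is reached behind a reverse proxy at that subpath rather than only via the NodePort. No Ingress ships in this commit; a hedged sketch of one that would match, on the same extensions/v1beta1 API generation the rest of the commit uses:

apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: wd-grafana   # hypothetical; not part of this commit
  namespace: autodl
spec:
  rules:
    - http:
        paths:
          - path: /grafana
            backend:
              serviceName: wd-grafana
              servicePort: 3000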
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: wd-node-exporter
  namespace: autodl
spec:
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: wd-node-exporter  # must match the wd-node-exporter Service selector below
    spec:
      imagePullSecrets:
        - name: "harbor-secret"
      serviceAccountName: autodl-serviceaccount
      nodeSelector:
        kpl: "true"
      containers:
        - name: wd-node-exporter
          image: hub.kce.ksyun.com/kpl_k8s/prometheus_node_exporter:v1.0.0-rc.0
          imagePullPolicy: Always
          args:
            - --path.procfs=/host/proc
            - --path.sysfs=/host/sys
            - --path.rootfs=/host/root
            - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker|var/lib/containers/.+)($|/)
            - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
          volumeMounts:
            - mountPath: /host/proc
              name: proc
              readOnly: true
            - mountPath: /host/sys
              name: sys
              readOnly: true
            - mountPath: /host/root
              name: root
              readOnly: true
          securityContext:
            runAsNonRoot: false
            runAsUser: 0
          ports:
            - containerPort: 9100
              name: port-9100
          livenessProbe:
            initialDelaySeconds: 30
            tcpSocket:
              port: port-9100
          resources:
            requests:
              cpu: "500m"
              memory: "512Mi"
            limits:
              cpu: "2000m"
              memory: "3Gi"
      volumes:
        - hostPath:
            path: /proc
          name: proc
        - hostPath:
            path: /sys
          name: sys
        - hostPath:
            path: /
          name: root
apiVersion: v1
kind: Service
metadata:
  name: wd-node-exporter
  namespace: autodl
spec:
  type: NodePort
  selector:
    app: wd-node-exporter
  ports:
    - port: 9100
      protocol: TCP
      name: port-9100
      targetPort: 9100
      nodePort: 30703
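The node-exporter pods are picked up by the kubernetes-pods job in the Prometheus config below, so raw node metrics such as node_cpu_seconds_total become queryable. No recording rules ship in this commit; a hedged sketch of one that precomputes per-node CPU utilisation (the group name is hypothetical):

groups:
  - name: node-cpu                # hypothetical rule group
    rules:
      - record: instance:node_cpu_utilisation:rate5m
        # 1 minus the average idle fraction over 5 minutes, per instance.
        expr: 1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))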
apiVersion: v1
kind: ConfigMap
metadata:
  name: wd-prometheus-config
  namespace: autodl
data:
  prometheus.yml: |
    scrape_configs:
      - job_name: 'kubernetes-apiservers'
        kubernetes_sd_configs:
          - role: endpoints
        # Default to scraping over https. If required, just disable this or change to
        # `http`.
        scheme: https
        # This TLS & bearer token file config is used to connect to the actual scrape
        # endpoints for cluster components. This is separate to discovery auth
        # configuration because discovery & scraping are two separate concerns in
        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
        # the cluster. Otherwise, more config options have to be provided within the
        # <kubernetes_sd_config>.
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          # If your node certificates are self-signed or use a different CA to the
          # master CA, then disable certificate verification below. Note that
          # certificate verification is an integral part of a secure infrastructure
          # so this should only be disabled in a controlled environment. You can
          # disable certificate verification by uncommenting the line below.
          #
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        # Keep only the default/kubernetes service endpoints for the https port. This
        # will add targets for each API server which Kubernetes adds an endpoint to
        # the default/kubernetes service.
        relabel_configs:
          - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
            action: keep
            regex: default;kubernetes;https
      # Scrape config for nodes (kubelet).
      #
      # Rather than connecting directly to the node, the scrape is proxied through the
      # Kubernetes apiserver. This means it will work if Prometheus is running out of
      # cluster, or can't connect to nodes for some other reason (e.g. because of
      # firewalling).
      - job_name: 'kubernetes-nodes'
        # Default to scraping over https. If required, just disable this or change to
        # `http`.
        scheme: https
        # This TLS & bearer token file config is used to connect to the actual scrape
        # endpoints for cluster components. This is separate to discovery auth
        # configuration because discovery & scraping are two separate concerns in
        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
        # the cluster. Otherwise, more config options have to be provided within the
        # <kubernetes_sd_config>.
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          # If your node certificates are self-signed or use a different CA to the
          # master CA, then disable certificate verification below. Note that
          # certificate verification is an integral part of a secure infrastructure
          # so this should only be disabled in a controlled environment. You can
          # disable certificate verification by uncommenting the line below.
          #
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
      # Scrape config for Kubelet cAdvisor.
      #
      # This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics
      # (those whose names begin with 'container_') have been removed from the
      # Kubelet metrics endpoint. This job scrapes the cAdvisor endpoint to
      # retrieve those metrics.
      #
      # In Kubernetes 1.7.0-1.7.2, these metrics are only exposed on the cAdvisor
      # HTTP endpoint; in that case, use the "/metrics" endpoint on port 4194 of the
      # nodes, and ensure cAdvisor's HTTP server hasn't been disabled with the
      # --cadvisor-port=0 Kubelet flag.
      #
      # This job is not necessary and should be removed in Kubernetes 1.6 and
      # earlier versions, or it will cause the metrics to be scraped twice.
      - job_name: 'kubernetes-cadvisor'
        # Default to scraping over https. If required, just disable this or change to
        # `http`.
        scheme: https
        # Starting Kubernetes 1.7.3 the cAdvisor metrics are under /metrics/cadvisor.
        # Kubernetes CIS Benchmark recommends against enabling the insecure HTTP
        # servers of Kubernetes, therefore the cAdvisor metrics on the secure handler
        # are used.
        metrics_path: /metrics/cadvisor
        # This TLS & bearer token file config is used to connect to the actual scrape
        # endpoints for cluster components. This is separate to discovery auth
        # configuration because discovery & scraping are two separate concerns in
        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
        # the cluster. Otherwise, more config options have to be provided within the
        # <kubernetes_sd_config>.
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          # If your node certificates are self-signed or use a different CA to the
          # master CA, then disable certificate verification below. Note that
          # certificate verification is an integral part of a secure infrastructure
          # so this should only be disabled in a controlled environment. You can
          # disable certificate verification by uncommenting the line below.
          #
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
      # Example scrape config for service endpoints.
      #
      # The relabeling allows the actual service scrape endpoint to be configured
      # for all or only some endpoints.
      - job_name: 'kubernetes-service-endpoints'
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          # Example relabel to scrape only endpoints that have
          # "example.io/should_be_scraped = true" annotation.
          # - source_labels: [__meta_kubernetes_service_annotation_example_io_should_be_scraped]
          #   action: keep
          #   regex: true
          #
          # Example relabel to customize metric path based on endpoints
          # "example.io/metric_path = <metric path>" annotation.
          # - source_labels: [__meta_kubernetes_service_annotation_example_io_metric_path]
          #   action: replace
          #   target_label: __metrics_path__
          #   regex: (.+)
          #
          # Example relabel to scrape only single, desired port for the service based
          # on endpoints "example.io/scrape_port = <port>" annotation.
          # - source_labels: [__address__, __meta_kubernetes_service_annotation_example_io_scrape_port]
          #   action: replace
          #   regex: ([^:]+)(?::\d+)?;(\d+)
          #   replacement: $1:$2
          #   target_label: __address__
          #
          # Example relabel to configure scrape scheme for all service scrape targets
          # based on endpoints "example.io/scrape_scheme = <scheme>" annotation.
          # - source_labels: [__meta_kubernetes_service_annotation_example_io_scrape_scheme]
          #   action: replace
          #   target_label: __scheme__
          #   regex: (https?)
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: kubernetes_name
      # Example scrape config for probing services via the Blackbox Exporter.
      #
      # The relabeling allows the actual service scrape endpoint to be configured
      # for all or only some services.
      - job_name: 'kubernetes-services'
        metrics_path: /probe
        params:
          module: [http_2xx]
        kubernetes_sd_configs:
          - role: service
        relabel_configs:
          # Example relabel to probe only some services that have "example.io/should_be_probed = true" annotation
          # - source_labels: [__meta_kubernetes_service_annotation_example_io_should_be_probed]
          #   action: keep
          #   regex: true
          - source_labels: [__address__]
            target_label: __param_target
          - target_label: __address__
            replacement: blackbox-exporter.example.com:9115
          - source_labels: [__param_target]
            target_label: instance
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            target_label: kubernetes_name
      # Example scrape config for probing ingresses via the Blackbox Exporter.
      #
      # The relabeling allows the actual ingress scrape endpoint to be configured
      # for all or only some services.
      - job_name: 'kubernetes-ingresses'
        metrics_path: /probe
        params:
          module: [http_2xx]
        kubernetes_sd_configs:
          - role: ingress
        relabel_configs:
          # Example relabel to probe only some ingresses that have "example.io/should_be_probed = true" annotation
          # - source_labels: [__meta_kubernetes_ingress_annotation_example_io_should_be_probed]
          #   action: keep
          #   regex: true
          - source_labels: [__meta_kubernetes_ingress_scheme, __address__, __meta_kubernetes_ingress_path]
            regex: (.+);(.+);(.+)
            replacement: ${1}://${2}${3}
            target_label: __param_target
          - target_label: __address__
            replacement: blackbox-exporter.example.com:9115
          - source_labels: [__param_target]
            target_label: instance
          - action: labelmap
            regex: __meta_kubernetes_ingress_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_ingress_name]
            target_label: kubernetes_name
      # Example scrape config for pods
      #
      # The relabeling allows the actual pod scrape to be configured
      # for all the declared ports (or port-free target if none is declared)
      # or only some ports.
      - job_name: 'kubernetes-pods'
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          # Example relabel to scrape only pods that have
          # "example.io/should_be_scraped = true" annotation.
          # - source_labels: [__meta_kubernetes_pod_annotation_example_io_should_be_scraped]
          #   action: keep
          #   regex: true
          #
          # Example relabel to customize metric path based on pod
          # "example.io/metric_path = <metric path>" annotation.
          # - source_labels: [__meta_kubernetes_pod_annotation_example_io_metric_path]
          #   action: replace
          #   target_label: __metrics_path__
          #   regex: (.+)
          #
          # Example relabel to scrape only single, desired port for the pod
          # based on pod "example.io/scrape_port = <port>" annotation.
          # - source_labels: [__address__, __meta_kubernetes_pod_annotation_example_io_scrape_port]
          #   action: replace
          #   regex: ([^:]+)(?::\d+)?;(\d+)
          #   replacement: $1:$2
          #   target_label: __address__
          - action: replace
            regex: (.*)
            replacement: $1
            source_labels: [__meta_kubernetes_pod_node_name]
            target_label: instance_node_name
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name
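The kubernetes-service-endpoints job above leaves its annotation-based opt-in relabel commented out, so every endpoint target is scraped. If the keep relabel were uncommented, a service would opt in with the example.io/should_be_scraped annotation from the config's own comments, e.g. (a hypothetical service, not part of this commit):

apiVersion: v1
kind: Service
metadata:
  name: my-exporter                        # hypothetical
  namespace: autodl
  annotations:
    example.io/should_be_scraped: "true"   # matches the commented keep relabel
spec:
  selector:
    app: my-exporter
  ports:
    - port: 9100
      targetPort: 9100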
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: wd-prometheus
  namespace: autodl
spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: wd-prometheus
    spec:
      imagePullSecrets:
        - name: "harbor-secret"
      serviceAccountName: autodl-serviceaccount
      nodeSelector:
        kpl: "true"
        internal_service_node: "true"
      containers:
        - name: wd-prometheus
          image: hub.kce.ksyun.com/kpl_k8s/prometheus:v2.17.2
          imagePullPolicy: Always
          volumeMounts:
            - name: config
              mountPath: /etc/prometheus
          command:
            - /bin/prometheus
          args:
            - --config.file=/etc/prometheus/prometheus.yml
          ports:
            - containerPort: 9090
              name: port-9090
          livenessProbe:
            initialDelaySeconds: 30
            tcpSocket:
              port: port-9090
          resources:
            requests:
              cpu: "500m"
              memory: "512Mi"
            limits:
              cpu: "2000m"
              memory: "3Gi"
      volumes:
        - name: config
          configMap:
            name: wd-prometheus-config
apiVersion: v1
kind: Service
metadata:
  name: wd-prometheus
  namespace: autodl
spec:
  type: NodePort
  selector:
    app: wd-prometheus
  ports:
    - port: 9090
      protocol: TCP
      name: port-9090
      targetPort: 9090
      nodePort: 30701
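The NodePort (30701) also makes this Prometheus reachable from outside the cluster, for example for federation into a central server. A hedged sketch of an external Prometheus federating from it (the job name and node address are placeholders):

scrape_configs:
  - job_name: 'wd-federate'                # hypothetical job
    honor_labels: true
    metrics_path: /federate
    params:
      'match[]': ['{job=~".+"}']           # pull every federated series
    static_configs:
      - targets: ['10.0.0.1:30701']        # placeholder node address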