
{% raw %}

Prometheus Monitoring

Deploy Prometheus

Image downloads

jicki/prometheus-operator:v0.16.1
jicki/configmap-reload:v0.0.1
jicki/kube-rbac-proxy:v0.2.0
jicki/node-exporter:v0.15.2
jicki/kube-state-metrics:v1.2.0
jicki/addon-resizer:1.0
jicki/monitoring-grafana:4.6.3
jicki/grafana-watcher:v0.0.8
quay.io/prometheus/prometheus:v2.1.0
quay.io/coreos/prometheus-config-reloader:v0.0.2
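
If the nodes cannot pull from Docker Hub or quay.io quickly, the images can be pre-pulled on every node beforehand; a minimal sketch, assuming Docker is the container runtime:

# run on every node to pre-pull the images used below
for image in \
  jicki/prometheus-operator:v0.16.1 \
  jicki/configmap-reload:v0.0.1 \
  jicki/kube-rbac-proxy:v0.2.0 \
  jicki/node-exporter:v0.15.2 \
  jicki/kube-state-metrics:v1.2.0 \
  jicki/addon-resizer:1.0 \
  jicki/monitoring-grafana:4.6.3 \
  jicki/grafana-watcher:v0.0.8 \
  quay.io/prometheus/prometheus:v2.1.0 \
  quay.io/coreos/prometheus-config-reloader:v0.0.2
do
  docker pull "$image"
done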

Create the namespace

# vi namespace.yaml

apiVersion: v1
kind: Namespace
metadata:
  name: monitoring

# apply the yaml

[root@kubernetes-64 monitoring]# kubectl apply -f namespace.yaml
namespace "monitoring" created

prometheus-operator

# vi prometheus-operator.yaml


---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: prometheus-operator
  namespace: monitoring
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus-operator
subjects:
- kind: ServiceAccount
  name: prometheus-operator
  namespace: monitoring
  
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: prometheus-operator
  namespace: monitoring
rules:
- apiGroups:
  - extensions
  resources:
  - thirdpartyresources
  verbs:
  - "*"
- apiGroups:
  - apiextensions.k8s.io
  resources:
  - customresourcedefinitions
  verbs:
  - "*"
- apiGroups:
  - monitoring.coreos.com
  resources:
  - alertmanagers
  - prometheuses
  - servicemonitors
  verbs:
  - "*"
- apiGroups:
  - apps
  resources:
  - statefulsets
  verbs: ["*"]
- apiGroups: [""]
  resources:
  - configmaps
  - secrets
  verbs: ["*"]
- apiGroups: [""]
  resources:
  - pods
  verbs: ["list", "delete"]
- apiGroups: [""]
  resources:
  - services
  - endpoints
  verbs: ["get", "create", "update"]
- apiGroups: [""]
  resources:
  - nodes
  verbs: ["list", "watch"]
- apiGroups: [""]
  resources:
  - namespaces
  verbs: ["list"]
  
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus-operator
  namespace: monitoring
  
---

apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  labels:
    k8s-app: prometheus-operator
  name: prometheus-operator
  namespace: monitoring
spec:
  replicas: 1
  template:
    metadata:
      labels:
        k8s-app: prometheus-operator
    spec:
      containers:
      - args:
        - --kubelet-service=kube-system/kubelet
        - --config-reloader-image=jicki/configmap-reload:v0.0.1
        image: jicki/prometheus-operator:v0.16.1
        name: prometheus-operator
        ports:
        - containerPort: 8080
          name: http
        resources:
          limits:
            cpu: 200m
            memory: 100Mi
          requests:
            cpu: 100m
            memory: 50Mi
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
      serviceAccountName: prometheus-operator
    
---
apiVersion: v1
kind: Service
metadata:
  name: prometheus-operator
  namespace: monitoring
  labels:
    k8s-app: prometheus-operator
spec:
  type: ClusterIP
  ports:
  - name: http
    port: 8080
    targetPort: http
    protocol: TCP
  selector:
    k8s-app: prometheus-operator


# apply the yaml file

[root@kubernetes-64 monitoring]# kubectl apply -f prometheus-operator.yaml 
clusterrolebinding "prometheus-operator" created
clusterrole "prometheus-operator" created
serviceaccount "prometheus-operator" created
deployment "prometheus-operator" created
service "prometheus-operator" created


[root@kubernetes-64 monitoring]# kubectl get pod -n monitoring
NAME                                READY     STATUS    RESTARTS   AGE
prometheus-operator-c544bf4-v6z5z   1/1       Running   0          11s
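
Before creating any custom resources, it is worth confirming that the operator has registered its CustomResourceDefinitions (a quick check; prometheus-operator v0.16 is expected to create CRDs in the monitoring.coreos.com API group):

# the operator should have created CRDs for Alertmanager, Prometheus and ServiceMonitor
kubectl get crd | grep monitoring.coreos.com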

node-exporter

vi node-exporter.yaml


---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: node-exporter
  namespace: monitoring
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: node-exporter
subjects:
- kind: ServiceAccount
  name: node-exporter
  namespace: monitoring
  
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: node-exporter
  namespace: monitoring
rules:
- apiGroups: ["authentication.k8s.io"]
  resources:
  - tokenreviews
  verbs: ["create"]
- apiGroups: ["authorization.k8s.io"]
  resources:
  - subjectaccessreviews
  verbs: ["create"]
  
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: node-exporter
  namespace: monitoring
  
---
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: monitoring
spec:
  updateStrategy:
    rollingUpdate:
      maxUnavailable: 1
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: node-exporter
      name: node-exporter
    spec:
      serviceAccountName: node-exporter
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
      hostNetwork: true
      hostPID: true
      containers:
      - image: jicki/node-exporter:v0.15.2
        args:
        - "--web.listen-address=127.0.0.1:9101"
        - "--path.procfs=/host/proc"
        - "--path.sysfs=/host/sys"
        name: node-exporter
        resources:
          requests:
            memory: 30Mi
            cpu: 100m
          limits:
            memory: 50Mi
            cpu: 200m
        volumeMounts:
        - name: proc
          readOnly:  true
          mountPath: /host/proc
        - name: sys
          readOnly: true
          mountPath: /host/sys
      - name: kube-rbac-proxy
        image: jicki/kube-rbac-proxy:v0.2.0
        args:
        - "--secure-listen-address=:9100"
        - "--upstream=http://127.0.0.1:9101/"
        ports:
        - containerPort: 9100
          hostPort: 9100
          name: https
        resources:
          requests:
            memory: 20Mi
            cpu: 10m
          limits:
            memory: 40Mi
            cpu: 20m
      tolerations:
        - effect: NoSchedule
          operator: Exists
      volumes:
      - name: proc
        hostPath:
          path: /proc
      - name: sys
        hostPath:
          path: /sys
          
---
apiVersion: v1
kind: Service
metadata:
  labels:
    app: node-exporter
    k8s-app: node-exporter
  name: node-exporter
  namespace: monitoring
spec:
  type: ClusterIP
  clusterIP: None
  ports:
  - name: https
    port: 9100
    protocol: TCP
  selector:
    app: node-exporter


# apply the yaml file

[root@kubernetes-64 monitoring]# kubectl apply -f node-exporter.yaml 
clusterrolebinding "node-exporter" created
clusterrole "node-exporter" created
serviceaccount "node-exporter" created
daemonset "node-exporter" created
service "node-exporter" created


[root@kubernetes-64 monitoring]# kubectl get pods -n monitoring -o wide
NAME                  READY     STATUS    RESTARTS   AGE       IP            NODE
node-exporter-2299v   2/2       Running   0          2m        172.16.1.66   kubernetes-66
node-exporter-2lsfc   2/2       Running   0          2m        172.16.1.64   kubernetes-64
node-exporter-mstjl   2/2       Running   0          2m        172.16.1.65   kubernetes-65
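
node-exporter runs as a DaemonSet with hostNetwork, so every node should expose port 9100 through the kube-rbac-proxy sidecar. Scraping that HTTPS port by hand requires a bearer token, so a simpler sanity check is to confirm scheduling and endpoints (sketch):

# DESIRED / CURRENT / READY should equal the number of nodes
kubectl get daemonset node-exporter -n monitoring

# the headless service should list one endpoint per node on port 9100
kubectl get endpoints node-exporter -n monitoring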


kube-state-metrics

vi kube-state-metrics.yaml


---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: kube-state-metrics
  namespace: monitoring
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-state-metrics
subjects:
- kind: ServiceAccount
  name: kube-state-metrics
  namespace: monitoring
  
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: kube-state-metrics
  namespace: monitoring
rules:
- apiGroups: [""]
  resources:
  - nodes
  - pods
  - services
  - resourcequotas
  - replicationcontrollers
  - limitranges
  - persistentvolumeclaims
  - persistentvolumes
  - namespaces
  - endpoints
  verbs: ["list", "watch"]
- apiGroups: ["extensions"]
  resources:
  - daemonsets
  - deployments
  - replicasets
  verbs: ["list", "watch"]
- apiGroups: ["apps"]
  resources:
  - statefulsets
  verbs: ["list", "watch"]
- apiGroups: ["batch"]
  resources:
  - cronjobs
  - jobs
  verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
  resources:
  - horizontalpodautoscalers
  verbs: ["list", "watch"]
- apiGroups: ["authentication.k8s.io"]
  resources:
  - tokenreviews
  verbs: ["create"]
- apiGroups: ["authorization.k8s.io"]
  resources:
  - subjectaccessreviews
  verbs: ["create"]
  
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
  name: kube-state-metrics
  namespace: monitoring
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: kube-state-metrics-resizer
subjects:
- kind: ServiceAccount
  name: kube-state-metrics
  
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
  name: kube-state-metrics-resizer
  namespace: monitoring
rules:
- apiGroups: [""]
  resources:
  - pods
  verbs: ["get"]
- apiGroups: ["extensions"]
  resources:
  - deployments
  resourceNames: ["kube-state-metrics"]
  verbs: ["get", "update"]
  
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kube-state-metrics
  namespace: monitoring
  
---
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: kube-state-metrics
  namespace: monitoring
spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: kube-state-metrics
    spec:
      serviceAccountName: kube-state-metrics
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
      containers:
      - name: kube-rbac-proxy-main
        image: jicki/kube-rbac-proxy:v0.2.0
        args:
        - "--secure-listen-address=:8443"
        - "--upstream=http://127.0.0.1:8081/"
        ports:
        - name: https-main
          containerPort: 8443
        resources:
          requests:
            memory: 20Mi
            cpu: 10m
          limits:
            memory: 40Mi
            cpu: 20m
      - name: kube-rbac-proxy-self
        image: jicki/kube-rbac-proxy:v0.2.0
        args:
        - "--secure-listen-address=:9443"
        - "--upstream=http://127.0.0.1:8082/"
        ports:
        - name: https-self
          containerPort: 9443
        resources:
          requests:
            memory: 20Mi
            cpu: 10m
          limits:
            memory: 40Mi
            cpu: 20m
      - name: kube-state-metrics
        image: jicki/kube-state-metrics:v1.2.0
        args:
        - "--host=127.0.0.1"
        - "--port=8081"
        - "--telemetry-host=127.0.0.1"
        - "--telemetry-port=8082"
      - name: addon-resizer
        image: jicki/addon-resizer:1.0
        resources:
          limits:
            cpu: 100m
            memory: 30Mi
          requests:
            cpu: 100m
            memory: 30Mi
        env:
          - name: MY_POD_NAME
            valueFrom:
              fieldRef:
                fieldPath: metadata.name
          - name: MY_POD_NAMESPACE
            valueFrom:
              fieldRef:
                fieldPath: metadata.namespace
        command:
          - /pod_nanny
          - --container=kube-state-metrics
          - --cpu=100m
          - --extra-cpu=2m
          - --memory=150Mi
          - --extra-memory=30Mi
          - --threshold=5
          - --deployment=kube-state-metrics
          
---
apiVersion: v1
kind: Service
metadata:
  labels:
    app: kube-state-metrics
    k8s-app: kube-state-metrics
  name: kube-state-metrics
  namespace: monitoring
spec:
  clusterIP: None
  ports:
  - name: https-main
    port: 8443
    targetPort: https-main
    protocol: TCP
  - name: https-self
    port: 9443
    targetPort: https-self
    protocol: TCP
  selector:
    app: kube-state-metrics
    

# apply the yaml file

[root@kubernetes-64 monitoring]# kubectl apply -f kube-state-metrics.yaml 
clusterrolebinding "kube-state-metrics" created
clusterrole "kube-state-metrics" created
rolebinding "kube-state-metrics" created
role "kube-state-metrics-resizer" created
serviceaccount "kube-state-metrics" created
deployment "kube-state-metrics" created
service "kube-state-metrics" created


[root@kubernetes-64 monitoring]# kubectl get pods -n monitoring
NAME                                  READY     STATUS    RESTARTS   AGE
kube-state-metrics-578778548f-8gf8d   4/4       Running   0          1m
node-exporter-9pgx5                   2/2       Running   0          2m
node-exporter-dz9kp                   2/2       Running   0          2m
node-exporter-mcd9m                   2/2       Running   0          2m
prometheus-operator-9bf5574-s7k76     1/1       Running   0          2m
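
The metrics port of kube-state-metrics is only reachable through kube-rbac-proxy over HTTPS, but the plain upstream port 8081 is bound on 127.0.0.1 inside the pod, so a quick way to eyeball the metrics is a port-forward straight to the pod (a sketch, using the pod name from the listing above; run curl in a second terminal):

# port-forward targets localhost inside the pod, so the 127.0.0.1-bound port
# should be reachable, bypassing kube-rbac-proxy
kubectl -n monitoring port-forward kube-state-metrics-578778548f-8gf8d 8081:8081

# in another terminal
curl -s http://127.0.0.1:8081/metrics | head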


prometheus

vi prometheus-k8s.yaml

---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
  name: prometheus-k8s
  namespace: monitoring
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: prometheus-k8s
subjects:
- kind: ServiceAccount
  name: prometheus-k8s
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
  name: prometheus-k8s
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: prometheus-k8s
subjects:
- kind: ServiceAccount
  name: prometheus-k8s
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
  name: prometheus-k8s
  namespace: default
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: prometheus-k8s
subjects:
- kind: ServiceAccount
  name: prometheus-k8s
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: prometheus-k8s
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus-k8s
subjects:
- kind: ServiceAccount
  name: prometheus-k8s
  namespace: monitoring

---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
  name: prometheus-k8s
  namespace: monitoring
rules:
- apiGroups: [""]
  resources:
  - nodes
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
- apiGroups: [""]
  resources:
  - configmaps
  verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
  name: prometheus-k8s
  namespace: kube-system
rules:
- apiGroups: [""]
  resources:
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
  name: prometheus-k8s
  namespace: default
rules:
- apiGroups: [""]
  resources:
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: prometheus-k8s
rules:
- apiGroups: [""]
  resources:
  - nodes/metrics
  verbs: ["get"]
- nonResourceURLs: ["/metrics"]
  verbs: ["get"]
  
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-k8s-rules
  namespace: monitoring
  labels:
    role: prometheus-rulefiles
    prometheus: k8s
data:
  alertmanager.rules.yaml: |+
    groups:
    - name: alertmanager.rules
      rules:
      - alert: AlertmanagerConfigInconsistent
        expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
          GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
          "alertmanager-$1", "alertmanager", "(.*)") != 1
        for: 5m
        labels:
          severity: critical
        annotations:
          description: The configuration of the instances of the Alertmanager cluster
            `{{$labels.service}}` are out of sync.
      - alert: AlertmanagerDownOrMissing
        expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
          "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
        for: 5m
        labels:
          severity: warning
        annotations:
          description: An unexpected number of Alertmanagers are scraped or Alertmanagers
            disappeared from discovery.
      - alert: AlertmanagerFailedReload
        expr: alertmanager_config_last_reload_successful == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
            }}/{{ $labels.pod}}.
  general.rules.yaml: |+
    groups:
    - name: general.rules
      rules:
      - alert: TargetDown
        expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% of {{ $labels.job }} targets are down.'
          summary: Targets are down
      - alert: DeadMansSwitch
        expr: vector(1)
        labels:
          severity: none
        annotations:
          description: This is a DeadMansSwitch meant to ensure that the entire Alerting
            pipeline is functional.
          summary: Alerting DeadMansSwitch
      - record: fd_utilization
        expr: process_open_fds / process_max_fds
      - alert: FdExhaustionClose
        expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
            will exhaust in file/socket descriptors within the next 4 hours'
          summary: file descriptors soon exhausted
      - alert: FdExhaustionClose
        expr: predict_linear(fd_utilization[10m], 3600) > 1
        for: 10m
        labels:
          severity: critical
        annotations:
          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
            will exhaust in file/socket descriptors within the next hour'
          summary: file descriptors soon exhausted
  kube-state-metrics.rules.yaml: |+
    groups:
    - name: kube-state-metrics.rules
      rules:
      - alert: DeploymentGenerationMismatch
        expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Observed deployment generation does not match expected one for
            deployment {{$labels.namespaces}}/{{$labels.deployment}}
          summary: Deployment is outdated
      - alert: DeploymentReplicasNotUpdated
        expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
          or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
          unless (kube_deployment_spec_paused == 1)
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}}
          summary: Deployment replicas are outdated
      - alert: DaemonSetRolloutStuck
        expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled
          * 100 < 100
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Only {{$value}}% of desired pods scheduled and ready for daemon
            set {{$labels.namespaces}}/{{$labels.daemonset}}
          summary: DaemonSet is missing pods
      - alert: K8SDaemonSetsNotScheduled
        expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
          > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: A number of daemonsets are not scheduled.
          summary: Daemonsets are not scheduled correctly
      - alert: DaemonSetsMissScheduled
        expr: kube_daemonset_status_number_misscheduled > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: A number of daemonsets are running where they are not supposed
            to run.
          summary: Daemonsets are not scheduled correctly
      - alert: PodFrequentlyRestarting
        expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Pod {{$labels.namespaces}}/{{$labels.pod}} was restarted {{$value}}
            times within the last hour
          summary: Pod is restarting frequently
  kubelet.rules.yaml: |+
    groups:
    - name: kubelet.rules
      rules:
      - alert: K8SNodeNotReady
        expr: kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 1h
        labels:
          severity: warning
        annotations:
          description: The Kubelet on {{ $labels.node }} has not checked in with the API,
            or has set itself to NotReady, for more than an hour
          summary: Node status is NotReady
      - alert: K8SManyNodesNotReady
        expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0)
          > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==
          0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
        for: 1m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }}% of Kubernetes nodes are not ready'
      - alert: K8SKubeletDown
        expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
        for: 1h
        labels:
          severity: warning
        annotations:
          description: Prometheus failed to scrape {{ $value }}% of kubelets.
      - alert: K8SKubeletDown
        expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
          * 100 > 1
        for: 1h
        labels:
          severity: critical
        annotations:
          description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets
            have disappeared from service discovery.
          summary: Many Kubelets cannot be scraped
      - alert: K8SKubeletTooManyPods
        expr: kubelet_running_pod_count > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
            to the limit of 110
          summary: Kubelet is close to pod limit
  kubernetes.rules.yaml: |+
    groups:
    - name: kubernetes.rules
      rules:
      - record: pod_name:container_memory_usage_bytes:sum
        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
          (pod_name)
      - record: pod_name:container_spec_cpu_shares:sum
        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
      - record: pod_name:container_cpu_usage:sum
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
          BY (pod_name)
      - record: pod_name:container_fs_usage_bytes:sum
        expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
      - record: namespace:container_memory_usage_bytes:sum
        expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
      - record: namespace:container_spec_cpu_shares:sum
        expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
      - record: namespace:container_cpu_usage:sum
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
          BY (namespace)
      - record: cluster:memory_usage:ratio
        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
          (cluster) / sum(machine_memory_bytes) BY (cluster)
      - record: cluster:container_spec_cpu_shares:ratio
        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000
          / sum(machine_cpu_cores)
      - record: cluster:container_cpu_usage:ratio
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
          / sum(machine_cpu_cores)
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) /
          1e+06
        labels:
          quantile: "0.99"
      - record: apiserver_latency:quantile_seconds
        expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
          1e+06
        labels:
          quantile: "0.9"
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) /
          1e+06
        labels:
          quantile: "0.5"
      - alert: APIServerLatencyHigh
        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
          > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: the API server has a 99th percentile latency of {{ $value }} seconds
            for {{$labels.verb}} {{$labels.resource}}
      - alert: APIServerLatencyHigh
        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
          > 4
        for: 10m
        labels:
          severity: critical
        annotations:
          description: the API server has a 99th percentile latency of {{ $value }} seconds
            for {{$labels.verb}} {{$labels.resource}}
      - alert: APIServerErrorsHigh
        expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
          * 100 > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          description: API server returns errors for {{ $value }}% of requests
      - alert: APIServerErrorsHigh
        expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
          * 100 > 5
        for: 10m
        labels:
          severity: critical
        annotations:
          description: API server returns errors for {{ $value }}% of requests
      - alert: K8SApiserverDown
        expr: absent(up{job="apiserver"} == 1)
        for: 20m
        labels:
          severity: critical
        annotations:
          description: No API servers are reachable or all have disappeared from service
            discovery
    
      - alert: K8sCertificateExpirationNotice
        labels:
          severity: warning
        annotations:
          description: Kubernetes API Certificate is expiring soon (less than 7 days)
        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0
    
      - alert: K8sCertificateExpirationNotice
        labels:
          severity: critical
        annotations:
          description: Kubernetes API Certificate is expiring in less than 1 day
        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0
  node.rules.yaml: |+
    groups:
    - name: node.rules
      rules:
      - record: instance:node_cpu:rate:sum
        expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
          BY (instance)
      - record: instance:node_filesystem_usage:sum
        expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
          BY (instance)
      - record: instance:node_network_receive_bytes:rate:sum
        expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
      - record: instance:node_network_transmit_bytes:rate:sum
        expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
      - record: instance:node_cpu:ratio
        expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance)
          GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
      - record: cluster:node_cpu:sum_rate5m
        expr: sum(rate(node_cpu{mode!="idle"}[5m]))
      - record: cluster:node_cpu:ratio
        expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu))
      - alert: NodeExporterDown
        expr: absent(up{job="node-exporter"} == 1)
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus could not scrape a node-exporter for more than 10m,
            or node-exporters have disappeared from discovery
      - alert: NodeDiskRunningFull
        expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
        for: 30m
        labels:
          severity: warning
        annotations:
          description: device {{$labels.device}} on node {{$labels.instance}} is running
            full within the next 24 hours (mounted at {{$labels.mountpoint}})
      - alert: NodeDiskRunningFull
        expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
        for: 10m
        labels:
          severity: critical
        annotations:
          description: device {{$labels.device}} on node {{$labels.instance}} is running
            full within the next 2 hours (mounted at {{$labels.mountpoint}})
  prometheus.rules.yaml: |+
    groups:
    - name: prometheus.rules
      rules:
      - alert: PrometheusConfigReloadFailed
        expr: prometheus_config_last_reload_successful == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
      - alert: PrometheusNotificationQueueRunningFull
        expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
            $labels.pod}}
      - alert: PrometheusErrorSendingAlerts
        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
          > 0.01
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
            $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
      - alert: PrometheusErrorSendingAlerts
        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
          > 0.03
        for: 10m
        labels:
          severity: critical
        annotations:
          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
            $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
      - alert: PrometheusNotConnectedToAlertmanagers
        expr: prometheus_notifications_alertmanagers_discovered < 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
            to any Alertmanagers
      - alert: PrometheusTSDBReloadsFailing
        expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
        for: 12h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
            reload failures over the last two hours.'
          summary: Prometheus has issues reloading data blocks from disk
      - alert: PrometheusTSDBCompactionsFailing
        expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
        for: 12h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
            compaction failures over the last two hours.'
          summary: Prometheus has issues compacting sample blocks
      - alert: PrometheusTSDBWALCorruptions
        expr: tsdb_wal_corruptions_total > 0
        for: 4h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
            log (WAL).'
          summary: Prometheus write-ahead log is corrupted


---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus-k8s
  namespace: monitoring
  
---
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: k8s
  namespace: monitoring
  labels:
    prometheus: k8s
spec:
  replicas: 2
  version: v2.1.0
  serviceAccountName: prometheus-k8s
  serviceMonitorSelector:
    matchExpressions:
    - {key: k8s-app, operator: Exists}
  ruleSelector:
    matchLabels:
      role: prometheus-rulefiles
      prometheus: k8s
  resources:
    requests:
      memory: 1024Mi
  alerting:
    alertmanagers:
    - namespace: monitoring
      name: alertmanager-main
      port: web
      
---
apiVersion: v1
kind: Service
metadata:
  labels:
    prometheus: k8s
  name: prometheus-k8s
  namespace: monitoring
spec:
  ports:
  - name: web
    port: 9090
    protocol: TCP
    targetPort: web
  selector:
    prometheus: k8s
  
---
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: prometheus-ingress
  namespace: monitoring
spec:
  rules:
  - host: prometheus.jicki.cn
    http:
      paths:
      - backend:
          serviceName: prometheus-k8s
          servicePort: 9090



# apply the yaml file

[root@kubernetes-64 monitoring]# kubectl apply -f prometheus-k8s.yaml 
rolebinding "prometheus-k8s" created
rolebinding "prometheus-k8s" created
rolebinding "prometheus-k8s" created
clusterrolebinding "prometheus-k8s" created
role "prometheus-k8s" created
role "prometheus-k8s" created
role "prometheus-k8s" created
clusterrole "prometheus-k8s" created
configmap "prometheus-k8s-rules" created
serviceaccount "prometheus-k8s" created
prometheus "k8s" created
service "prometheus-k8s" created
ingress "prometheus-ingress" created


# check the services

[root@kubernetes-64 monitoring]# kubectl get pods -n monitoring
NAME                                  READY     STATUS    RESTARTS   AGE
grafana-f8db675bc-5qg9j               2/2       Running   0          2h
kube-state-metrics-578778548f-8gf8d   4/4       Running   0          3h
node-exporter-9pgx5                   2/2       Running   0          3h
node-exporter-dz9kp                   2/2       Running   0          3h
node-exporter-mcd9m                   2/2       Running   0          3h
prometheus-k8s-0                      2/2       Running   0          9m
prometheus-k8s-1                      2/2       Running   0          9m
prometheus-operator-9bf5574-s7k76     1/1       Running   0          3h


# add the ServiceMonitor configuration

vi prometheus-k8s-service-monitor.yaml

---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kube-state-metrics
  namespace: monitoring
  labels:
    k8s-app: kube-state-metrics
spec:
  jobLabel: k8s-app
  selector:
    matchLabels:
      k8s-app: kube-state-metrics
  namespaceSelector:
    matchNames:
    - monitoring
  endpoints:
  - port: https-main
    scheme: https
    interval: 30s
    honorLabels: true
    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    tlsConfig:
      insecureSkipVerify: true
  - port: https-self
    scheme: https
    interval: 30s
    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    tlsConfig:
      insecureSkipVerify: true

---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: node-exporter
  namespace: monitoring
  labels:
    k8s-app: node-exporter
spec:
  jobLabel: k8s-app
  selector:
    matchLabels:
      k8s-app: node-exporter
  namespaceSelector:
    matchNames:
    - monitoring
  endpoints:
  - port: https
    scheme: https
    interval: 30s
    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    tlsConfig:
      insecureSkipVerify: true
      
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: prometheus-operator
  namespace: monitoring
  labels:
    k8s-app: prometheus-operator
spec:
  endpoints:
  - port: http
  selector:
    matchLabels:
      k8s-app: prometheus-operator
      
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    k8s-app: prometheus
spec:
  selector:
    matchLabels:
      prometheus: k8s
  namespaceSelector:
    matchNames:
    - monitoring
  endpoints:
  - port: web
    interval: 30s
    
    
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: alertmanager
  namespace: monitoring
  labels:
    k8s-app: alertmanager
spec:
  selector:
    matchLabels:
      alertmanager: main
  namespaceSelector:
    matchNames:
    - monitoring
  endpoints:
  - port: web
    interval: 30s

---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kube-apiserver
  namespace: monitoring
  labels:
    k8s-app: apiserver
spec:
  jobLabel: component
  selector:
    matchLabels:
      component: apiserver
      provider: kubernetes
  namespaceSelector:
    matchNames:
    - default
  endpoints:
  - port: https
    interval: 30s
    scheme: https
    tlsConfig:
      caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      serverName: kubernetes
    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kubelet
  namespace: monitoring
  labels:
    k8s-app: kubelet
spec:
  jobLabel: k8s-app
  endpoints:
  - port: https-metrics
    scheme: https
    interval: 30s
    tlsConfig:
      insecureSkipVerify: true
    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
  - port: https-metrics
    scheme: https
    path: /metrics/cadvisor
    interval: 30s
    honorLabels: true
    tlsConfig:
      insecureSkipVerify: true
    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
  selector:
    matchLabels:
      k8s-app: kubelet
  namespaceSelector:
    matchNames:
    - kube-system
    

# apply the yaml file

[root@kubernetes-64 monitoring]# kubectl apply -f prometheus-k8s-service-monitor.yaml 
servicemonitor "kube-state-metrics" created
servicemonitor "node-exporter" created
servicemonitor "prometheus-operator" created
servicemonitor "prometheus" created
servicemonitor "alertmanager" created
servicemonitor "kube-apiserver" created
servicemonitor "kubelet" created



# check the services

[root@kubernetes-64 monitoring]# kubectl get servicemonitor -n monitoring
NAME                      AGE
alertmanager              23s
kube-apiserver            23s
kube-state-metrics        23s
kubelet                   23s
node-exporter             23s
prometheus                23s
prometheus-operator       23s
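
Once the ServiceMonitors have been reconciled, the matching scrape targets should show up in the Prometheus web UI. Without going through the Ingress, a quick check is to port-forward one replica (sketch):

# open http://localhost:9090/targets after forwarding the web port
kubectl -n monitoring port-forward prometheus-k8s-0 9090:9090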

alertmanager

# configure the Alertmanager configuration Secret

vi alertmanager-secret.yaml


apiVersion: v1
kind: Secret
metadata:
  name: alertmanager-main
  namespace: monitoring
data:
  alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnbnVsbCcKICByb3V0ZXM6CiAgLSBtYXRjaDoKICAgICAgYWxlcnRuYW1lOiBEZWFkTWFuc1N3aXRjaAogICAgcmVjZWl2ZXI6ICdudWxsJwpyZWNlaXZlcnM6Ci0gbmFtZTogJ251bGwnCg==
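
The alertmanager.yaml value is simply a base64-encoded Alertmanager configuration file (here it appears to be the usual default: a routing tree that sends everything to a single 'null' receiver). To supply a custom configuration, encode a plain alertmanager.yaml the same way (sketch):

# encode a plain alertmanager.yaml for the Secret (GNU coreutils base64)
base64 -w0 alertmanager.yaml

# to inspect an encoded value, pipe it back through base64 -d
echo '<base64 string>' | base64 -d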

# apply the yaml file

[root@kubernetes-64 monitoring]# kubectl apply -f alertmanager-secret.yaml 
secret "alertmanager-main" created


# check

[root@kubernetes-64 monitoring]# kubectl get secret -n monitoring
NAME                              TYPE                                  DATA      AGE
alertmanager-main                 Opaque                                1         28s
default-token-t6cdn               kubernetes.io/service-account-token   3         4h
grafana-credentials               Opaque                                2         3h
kube-state-metrics-token-ctnl6    kubernetes.io/service-account-token   3         3h
node-exporter-token-nflwn         kubernetes.io/service-account-token   3         3h
prometheus-k8s                    Opaque                                2         21m
prometheus-k8s-token-2dt66        kubernetes.io/service-account-token   3         21m
prometheus-operator-token-k5m47   kubernetes.io/service-account-token   3         3h

vi alertmanager.yaml


---
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
  name: main
  namespace: monitoring
  labels:
    alertmanager: main
spec:
  replicas: 3
  version: v0.13.0
  
---
apiVersion: v1
kind: Service
metadata:
  labels:
    alertmanager: main
  name: alertmanager-main
  namespace: monitoring
spec:
  ports:
  - name: web
    port: 9093
    protocol: TCP
    targetPort: web
  selector:
    alertmanager: main

---
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: alertmanager-ingress
  namespace: monitoring
spec:
  rules:
  - host: alertmanager.jicki.cn
    http:
      paths:
      - backend:
          serviceName: alertmanager-main
          servicePort: 9093


# apply the yaml file

[root@kubernetes-64 monitoring]# kubectl apply -f alertmanager.yaml 
alertmanager "main" created
service "alertmanager-main" created
ingress "alertmanager-ingress" created



# check the services

[root@kubernetes-64 monitoring]# kubectl get pods -n monitoring
NAME                                  READY     STATUS    RESTARTS   AGE
alertmanager-main-0                   2/2       Running   0          2m
alertmanager-main-1                   2/2       Running   0          1m
alertmanager-main-2                   2/2       Running   0          11s
grafana-f8db675bc-5qg9j               2/2       Running   0          3h
kube-state-metrics-578778548f-8gf8d   4/4       Running   0          3h
node-exporter-9pgx5                   2/2       Running   0          3h
node-exporter-dz9kp                   2/2       Running   0          3h
node-exporter-mcd9m                   2/2       Running   0          3h
prometheus-k8s-0                      2/2       Running   0          27m
prometheus-k8s-1                      2/2       Running   0          27m
prometheus-operator-9bf5574-s7k76     1/1       Running   0          3h
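
As with Prometheus, the Alertmanager UI is reachable through its Ingress or, for a quick check, by port-forwarding one replica (sketch):

# open http://localhost:9093 after forwarding the web port
kubectl -n monitoring port-forward alertmanager-main-0 9093:9093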


grafana

# create the credentials Secret (values are base64-encoded; YWRtaW4= decodes to admin)

vi grafana-credentials.yaml

apiVersion: v1
kind: Secret
metadata:
  name: grafana-credentials
  namespace: monitoring
data:
  user: YWRtaW4=
  password: YWRtaW4=
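
Both values are plain base64-encoded strings; to use different credentials, generate the values the same way and substitute them into the Secret:

# "admin" encodes to YWRtaW4= (note the -n: no trailing newline)
echo -n 'admin' | base64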

# apply the yaml file

[root@kubernetes-64 monitoring]# kubectl apply -f grafana-credentials.yaml 
secret "grafana-credentials" created

# download the dashboard yaml file

wget https://raw.githubusercontent.com/jicki/kuberneres/master/grafana-dashboards.yaml


# apply the yaml file

[root@kubernetes-64 monitoring]# kubectl apply -f grafana-dashboards.yaml 
configmap "grafana-dashboards-0" created


# vi grafana.yaml

---
apiVersion: apps/v1beta1
kind: Deployment
metadata:
  name: grafana
  namespace: monitoring
spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: grafana
    spec:
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
      containers:
      - name: grafana
        image: jicki/monitoring-grafana:4.6.3-non-root
        env:
        - name: GF_AUTH_BASIC_ENABLED
          value: "true"
        - name: GF_AUTH_ANONYMOUS_ENABLED
          value: "true"
        - name: GF_SECURITY_ADMIN_USER
          valueFrom:
            secretKeyRef:
              name: grafana-credentials
              key: user
        - name: GF_SECURITY_ADMIN_PASSWORD
          valueFrom:
            secretKeyRef:
              name: grafana-credentials
              key: password
        volumeMounts:
        - name: grafana-storage
          mountPath: /data
        ports:
        - name: web
          containerPort: 3000
        resources:
          requests:
            memory: 100Mi
            cpu: 100m
          limits:
            memory: 200Mi
            cpu: 200m
      - name: grafana-watcher
        image: jicki/grafana-watcher:v0.0.8
        args:
          - '--watch-dir=/var/grafana-dashboards-0'
          - '--grafana-url=http://localhost:3000'
        env:
        - name: GRAFANA_USER
          valueFrom:
            secretKeyRef:
              name: grafana-credentials
              key: user
        - name: GRAFANA_PASSWORD
          valueFrom:
            secretKeyRef:
              name: grafana-credentials
              key: password
        resources:
          requests:
            memory: "16Mi"
            cpu: "50m"
          limits:
            memory: "32Mi"
            cpu: "100m"
        volumeMounts:
        - name: grafana-dashboards-0
          mountPath: /var/grafana-dashboards-0
      volumes:
      - name: grafana-storage
        emptyDir: {}
      - name: grafana-dashboards-0
        configMap:
          name: grafana-dashboards-0
          
---
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
spec:
  ports:
  - port: 3000
    protocol: TCP
    targetPort: web
  selector:
    app: grafana
    
---
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: grafana-ingress
  namespace: monitoring
spec:
  rules:
  - host: grafana.jicki.cn
    http:
      paths:
      - backend:
          serviceName: grafana
          servicePort: 3000



# apply the yaml file

[root@kubernetes-64 monitoring]# kubectl apply -f grafana.yaml 
deployment "grafana" created
service "grafana" created
ingress "grafana-ingress" created






# check

[root@kubernetes-64 monitoring]# kubectl get pods -n monitoring
NAME                                  READY     STATUS    RESTARTS   AGE
grafana-f8db675bc-5qg9j               2/2       Running   0          2h
kube-state-metrics-578778548f-8gf8d   4/4       Running   0          2h
node-exporter-9pgx5                   2/2       Running   0          2h
node-exporter-dz9kp                   2/2       Running   0          2h
node-exporter-mcd9m                   2/2       Running   0          2h
prometheus-operator-9bf5574-s7k76     1/1       Running   0          2h
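
Grafana is then reachable through grafana.jicki.cn (see the Ingress above) or, for a quick check, via a port-forward to the pod, logging in with the admin/admin credentials from the grafana-credentials Secret (a sketch, using the pod name from the listing above):

# open http://localhost:3000 after forwarding the web port (login: admin / admin)
kubectl -n monitoring port-forward grafana-f8db675bc-5qg9j 3000:3000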


Screenshots

https://jicki.cn/img/posts/grafana/grafana1.png
https://jicki.cn/img/posts/grafana/grafana2.png
https://jicki.cn/img/posts/grafana/grafana3.png
https://jicki.cn/img/posts/grafana/grafana4.png
https://jicki.cn/img/posts/grafana/grafana5.png

{% endraw %}