一句话总结
微服务监控体系分三层:指标监控(Prometheus + Grafana,CPU/内存/QPS/RT)、健康检查(Spring Boot Actuator,/health 端点)、日志聚合(ELK:Elasticsearch + Logstash + Kibana)。Spring Boot Admin 是可视化监控平台,集成 Actuator 端点展示服务状态。核心指标:QPS(每秒请求数)、RT(响应时间)、错误率、CPU/内存使用率、JVM GC。告警体系:Prometheus AlertManager → 钉钉/邮件/短信。
初级理解
Spring Boot Actuator
# Actuator 端点
# Add the Actuator starter dependency (pom.xml)
<dependency>
  <groupId>org.springframework.boot</groupId>
  <artifactId>spring-boot-starter-actuator</artifactId>
  <!-- No <version> here: presumably inherited from the Spring Boot parent/BOM; confirm for your build. -->
</dependency>
# Expose selected Actuator endpoints over HTTP (application.yml).
# Keys must be nested as shown; flattened to one level they bind to nothing.
management:
  endpoints:
    web:
      exposure:
        # Only the endpoint ids listed here are reachable under /actuator/*.
        include: health,info,metrics,prometheus
  endpoint:
    health:
      # "always" returns per-component details to every caller.
      # NOTE(review): consider "when-authorized" outside local/dev environments.
      show-details: always
# 常用端点
# /actuator/health → 健康检查
# /actuator/info → 应用信息
# /actuator/metrics → 指标列表
# /actuator/prometheus → Prometheus 格式指标
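# /actuator/env → 环境属性(配置属性、系统属性、环境变量;需在 exposure.include 中加入 env 才会通过 HTTP 暴露)
# /actuator/loggers → 日志级别查看与动态修改(需在 exposure.include 中加入 loggers 才会通过 HTTP 暴露)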
中级深入
Prometheus + Grafana
# Prometheus:指标采集和存储
# Grafana:可视化仪表盘
# 1. Add the Micrometer Prometheus registry dependency (pom.xml)
<dependency>
  <groupId>io.micrometer</groupId>
  <artifactId>micrometer-registry-prometheus</artifactId>
  <!-- No <version> here: presumably managed by the Spring Boot BOM; confirm for your build. -->
</dependency>
# 2. Prometheus scrape configuration (prometheus.yml)
scrape_configs:
  # One job covering the Spring Boot instances listed below.
  - job_name: 'spring-boot'
    # Scrape the Actuator Prometheus endpoint rather than the default /metrics path.
    metrics_path: '/actuator/prometheus'
    static_configs:
      # host:port of each application instance to scrape.
      - targets: ['localhost:8080', 'localhost:8081']
# 3. Grafana 导入 Spring Boot Dashboard
# Dashboard ID: 10280(Spring Boot 2.x Statistics)
# 核心指标
# jvm_memory_used_bytes → JVM 内存使用
# jvm_gc_pause_seconds → GC 暂停时间
# http_server_requests_seconds → HTTP 请求耗时
# process_cpu_usage → CPU 使用率
Spring Boot Admin
# Spring Boot Admin:可视化监控平台
# 1. 创建 Admin Server
/**
 * Entry point of the Spring Boot Admin server application.
 *
 * <p>{@code @EnableAdminServer} turns this Spring Boot application into an Admin Server.
 */
@SpringBootApplication
@EnableAdminServer
public class AdminServerApplication {

    /**
     * Boots the application.
     *
     * @param args command-line arguments, passed through to Spring unchanged
     */
    public static void main(String[] args) {
        SpringApplication.run(AdminServerApplication.class, args);
    }
}
# 2. Register the client application with the Admin Server (client-side application.yml).
# Keys must be nested as shown so they resolve to spring.boot.admin.client.url.
spring:
  boot:
    admin:
      client:
        url: http://localhost:9090  # Admin Server address
# Admin Server 功能
# 查看所有服务实例状态(UP/DOWN)
# 查看详细信息(JVM、线程、内存、GC)
# 查看日志、环境变量
# 动态修改日志级别
高级拓展
ELK 日志聚合
# ELK:Elasticsearch + Logstash + Kibana
# 日志收集流程
# 应用日志 → Filebeat(采集)→ Logstash(处理)→ Elasticsearch(存储)→ Kibana(展示)
# 1. Filebeat configuration (filebeat.yml)
filebeat.inputs:
  # NOTE(review): newer Filebeat releases favor the `filestream` input over `log`;
  # confirm against the deployed Filebeat version.
  - type: log
    paths:
      - /var/log/app/*.log
    # Join continuation lines (e.g. stack traces) onto the preceding event:
    # any line that does NOT start with a yyyy-MM-dd date is appended after
    # the last line that did.
    multiline:
      pattern: '^\d{4}-\d{2}-\d{2}'
      negate: true
      match: after

# Ship events to Logstash. The port matches the `beats { port => 5044 }` input
# of the Logstash pipeline; the host is assumed to be local like the other examples.
output.logstash:
  hosts: ['localhost:5044']
# 2. Logstash pipeline: receive events from Beats, parse them with grok, index into Elasticsearch
input {
  # Accept connections from Filebeat.
  beats {
    port => 5044
  }
}

filter {
  # Split each log line into `timestamp`, `level` and the remaining text (`msg`).
  grok {
    match => {
      "message" => "%{TIMESTAMP_ISO8601:timestamp} %{LOGLEVEL:level} %{GREEDYDATA:msg}"
    }
  }
}

output {
  # Write to a date-suffixed index (one index per day).
  elasticsearch {
    hosts => ["localhost:9200"]
    index => "app-logs-%{+YYYY.MM.dd}"
  }
}
# 3. Kibana 查询
# 按 TraceId 搜索完整调用链日志
# 按时间范围统计错误率
# 按服务名过滤日志
告警体系
# Prometheus alerting rules. Prometheus loads and evaluates this rule file;
# firing alerts are then handed to Alertmanager, which routes the notifications.
groups:
  - name: spring-boot-alerts
    rules:
      # Fires when a scrape target has been unreachable for one minute.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "服务 {{ $labels.instance }} 宕机"

      # Fires when more than 10% of requests returned 5xx over the last 5 minutes.
      # The ratio (5xx rate / total rate) is what makes the 0.1 threshold mean "10%";
      # comparing the raw 5xx rate to 0.1 would mean "0.1 failing requests per second".
      - alert: HighErrorRate
        expr: |
          sum by (job, instance) (rate(http_server_requests_seconds_count{status=~"5.."}[5m]))
            /
          sum by (job, instance) (rate(http_server_requests_seconds_count[5m]))
            > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "错误率超过 10%"
# 告警通知渠道
# 钉钉、企业微信、邮件、短信、PagerDuty
实战场景
场景:自定义健康检查
# 自定义健康指示器
/**
 * Custom health indicator that reports whether Redis is reachable.
 *
 * <p>The check performs a single read of the {@code "health-check"} key; the value
 * itself is ignored, only whether the call completes without an exception matters.
 */
@Component
public class RedisHealthIndicator implements HealthIndicator {

    // Wildcard type arguments instead of a raw type: the check only needs a
    // read by key, so the template's concrete key/value types are irrelevant.
    @Autowired
    private RedisTemplate<?, ?> redisTemplate;

    /**
     * Probes Redis and translates the outcome into a {@link Health} status.
     *
     * @return UP with {@code redis=connected} when the read succeeds; DOWN with
     *         {@code redis=disconnected} plus an {@code error} detail when it throws
     */
    @Override
    public Health health() {
        try {
            redisTemplate.opsForValue().get("health-check");
            return Health.up()
                    .withDetail("redis", "connected")
                    .build();
        } catch (Exception ex) {
            // Health.down(ex) records the exception as the "error" detail itself.
            // Passing ex.getMessage() to withDetail directly would fail for
            // exceptions that carry no message, since a detail value must not be null.
            return Health.down(ex)
                    .withDetail("redis", "disconnected")
                    .build();
        }
    }
}
# /actuator/health 返回
# {
# "status": "UP",
# "components": {
# "redis": {"status": "UP", "details": {"redis": "connected"}},
# "db": {"status": "UP"},
# "diskSpace": {"status": "UP"}
# }
# }
面试模拟
面试官:微服务监控怎么做?
你:三层监控:指标监控用 Prometheus + Grafana(CPU/内存/QPS/RT),健康检查用 Actuator,日志聚合用 ELK。Spring Boot Admin 做可视化监控平台。告警用 Prometheus AlertManager 对接钉钉/邮件。
面试官:如何监控 JVM 状态?
你:通过 Micrometer + Prometheus 采集 JVM 指标(内存使用、GC 暂停时间、线程数、类加载数),Grafana 可视化展示。设置告警规则:堆内存使用率超过 80%、GC 频率异常等。