Advanced Grafana Dashboards: From Basic to Beautiful
Our Grafana dashboards were basic. Static queries, no context, hard to troubleshoot.
Built advanced dashboards with variables, annotations, and custom panels. MTTR 30min → 5min.
Table of Contents
Dashboard Variables
{
"templating": {
"list": [
{
"name": "environment",
"type": "custom",
"options": ["production", "staging", "development"]
},
{
"name": "namespace",
"type": "query",
"query": "label_values(kube_pod_info, namespace)",
"datasource": "Prometheus"
},
{
"name": "pod",
"type": "query",
"query": "label_values(kube_pod_info{namespace=\"$namespace\"}, pod)",
"datasource": "Prometheus"
}
]
}
}
Query with Variables:
# CPU usage for selected pod
rate(container_cpu_usage_seconds_total{
namespace="$namespace",
pod="$pod"
}[5m])
# Memory usage
container_memory_usage_bytes{
namespace="$namespace",
pod="$pod"
}
Annotations
{
"annotations": {
"list": [
{
"name": "Deployments",
"datasource": "Prometheus",
"expr": "changes(kube_deployment_status_observed_generation[5m]) > 0",
"tagKeys": "deployment",
"titleFormat": "Deployment: {{deployment}}",
"textFormat": "New version deployed"
},
{
"name": "Alerts",
"datasource": "Prometheus",
"expr": "ALERTS{alertstate=\"firing\"}",
"tagKeys": "alertname",
"titleFormat": "Alert: {{alertname}}",
"textFormat": "{{annotations.description}}"
}
]
}
}
Custom Panels
Heatmap for Latency Distribution:
{
"type": "heatmap",
"targets": [
{
"expr": "sum(rate(http_request_duration_seconds_bucket[5m])) by (le)",
"format": "heatmap",
"legendFormat": "{{le}}"
}
],
"yAxis": {
"format": "s",
"decimals": 2
},
"dataFormat": "tsbuckets"
}
Stat Panel with Thresholds:
{
"type": "stat",
"targets": [
{
"expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100"
}
],
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{"value": 0, "color": "green"},
{"value": 1, "color": "yellow"},
{"value": 5, "color": "red"}
]
},
"unit": "percent"
}
}
}
Alert Rules
{
"alert": {
"name": "High Error Rate",
"conditions": [
{
"evaluator": {
"params": [5],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": ["A", "5m", "now"]
},
"reducer": {
"params": [],
"type": "avg"
},
"type": "query"
}
],
"executionErrorState": "alerting",
"frequency": "1m",
"handler": 1,
"message": "Error rate is above 5%",
"name": "High Error Rate",
"noDataState": "no_data",
"notifications": [
{"uid": "slack-notifications"}
]
}
}
Row Repeat
{
"panels": [
{
"type": "row",
"title": "Service: $service",
"repeat": "service",
"panels": [
{
"title": "Request Rate",
"targets": [
{
"expr": "rate(http_requests_total{service=\"$service\"}[5m])"
}
]
},
{
"title": "Error Rate",
"targets": [
{
"expr": "rate(http_requests_total{service=\"$service\",status=~\"5..\"}[5m])"
}
]
}
]
}
]
}
Python Dashboard Generator
import json
class GrafanaDashboard:
def __init__(self, title):
self.dashboard = {
"title": title,
"panels": [],
"templating": {"list": []},
"annotations": {"list": []}
}
def add_variable(self, name, query, datasource="Prometheus"):
"""Add template variable."""
self.dashboard["templating"]["list"].append({
"name": name,
"type": "query",
"query": query,
"datasource": datasource
})
def add_graph(self, title, queries, y_pos=0):
"""Add graph panel."""
panel = {
"type": "graph",
"title": title,
"targets": [
{"expr": q, "legendFormat": f"Query {i}"}
for i, q in enumerate(queries)
],
"gridPos": {"y": y_pos, "h": 8, "w": 12}
}
self.dashboard["panels"].append(panel)
def export(self, filename):
"""Export dashboard JSON."""
with open(filename, 'w') as f:
json.dump(self.dashboard, f, indent=2)
# Usage
dashboard = GrafanaDashboard("API Monitoring")
dashboard.add_variable(
"namespace",
"label_values(kube_pod_info, namespace)"
)
dashboard.add_graph(
"Request Rate",
[
'rate(http_requests_total{namespace="$namespace"}[5m])',
'rate(http_requests_total{namespace="$namespace",status=~"5.."}[5m])'
]
)
dashboard.export("api-dashboard.json")
Provisioning
# dashboards.yml
apiVersion: 1
providers:
- name: 'default'
orgId: 1
folder: ''
type: file
disableDeletion: false
updateIntervalSeconds: 10
options:
path: /etc/grafana/provisioning/dashboards
Datasource Provisioning:
# datasources.yml
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
isDefault: true
- name: Loki
type: loki
access: proxy
url: http://loki:3100
Results
MTTR Improvement:
- Before: 30min average
- After: 5min average
- Improvement: 83%
Dashboard Features:
- Variables: Dynamic filtering
- Annotations: Deployment markers
- Alerts: Proactive notifications
- Custom panels: Better visualization
Team Productivity:
- Troubleshooting time: -83%
- False alerts: -60%
- Dashboard creation time: -70%
Lessons Learned
- Variables essential: Dynamic dashboards
- Annotations provide context: See deployments
- Alerts catch issues early: Proactive
- Custom panels better: Right visualization
- Provisioning automates: No manual work
Conclusion
Advanced Grafana dashboards transformed our monitoring. MTTR 30min → 5min, 83% improvement.
Key takeaways:
- MTTR: 30min → 5min (-83%)
- Variables: Dynamic filtering
- Annotations: Deployment context
- Alerts: Proactive monitoring
- Provisioning: Automated setup
Build advanced dashboards. Your team will thank you.