From 6c9b0b889f1a8b1610ab900b7a949eb095182ac0 Mon Sep 17 00:00:00 2001 From: Amit Kumar Das <40661238+amityt@users.noreply.github.com> Date: Mon, 5 Jul 2021 09:52:39 +0530 Subject: [PATCH] Add litmus-portal dashboards in monitoring directory (#478) Signed-off-by: Amit Kumar Das --- .../litmus-portal/node_metrics.json | 144 +++++ .../dashboards/litmus-portal/pod_metrics.json | 73 +++ .../dashboards/litmus-portal/sock-shop.json | 550 ++++++++++++++++++ 3 files changed, 767 insertions(+) create mode 100644 monitoring/dashboards/litmus-portal/node_metrics.json create mode 100644 monitoring/dashboards/litmus-portal/pod_metrics.json create mode 100644 monitoring/dashboards/litmus-portal/sock-shop.json diff --git a/monitoring/dashboards/litmus-portal/node_metrics.json b/monitoring/dashboards/litmus-portal/node_metrics.json new file mode 100644 index 0000000..f6cff8f --- /dev/null +++ b/monitoring/dashboards/litmus-portal/node_metrics.json @@ -0,0 +1,144 @@ +{ + "dashboardID": "generic_node_metrics", + "name": "Node metrics", + "information": "This dashboard visualizes Node level CPU, memory, disk and IO utilization metrics interleaved with chaos events.", + "chaosEventQueryTemplate": "litmuschaos_awaited_experiments{job=\"chaos-exporter\"}", + "chaosVerdictQueryTemplate": "litmuschaos_experiment_verdict{job=\"chaos-exporter\"}", + "panelGroupMap": [ + { + "groupName": "CPU Usage Metrics", + "panels": ["Chaos-Node-CPU Utilization"] + }, + { + "groupName": "Memory Usage Metrics", + "panels": ["Chaos-Node-Memory Utilization"] + }, + { + "groupName": "Disk Usage Metrics", + "panels": [ + "Chaos-Node-Disk I/O Usage R/W", + "Chaos-Node-Disk I/O Usage Times" + ] + }, + { + "groupName": "Network Usage Metrics", + "panels": [ + "Chaos-Node-Network Traffic Bytes", + "Chaos-Node-Network Traffic Packets" + ] + } + ], + "panelGroups": [ + { + "panel_group_name": "CPU Usage Metrics", + "panels": [ + { + "panel_name": "Chaos-Node-CPU Utilization", + "panel_options": { + 
"points": false, + "grids": true, + "left_axis": true + }, + "y_axis_left": "Cores", + "y_axis_right": "CHAOS", + "x_axis_down": "Time", + "unit": "%", + "prom_queries": [ + { + "prom_query_name": "instance:node_cpu_utilisation:rate1m*100", + "legend": "{{instance}}", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + } + ] + } + ] + }, + { + "panel_group_name": "Memory Usage Metrics", + "panels": [ + { + "panel_name": "Chaos-Node-Memory Utilization", + "panel_options": { + "points": false, + "grids": true, + "left_axis": true + }, + "y_axis_left": "Memory", + "y_axis_right": "CHAOS", + "x_axis_down": "Time", + "unit": "%", + "prom_queries": [ + { + "prom_query_name": "instance:node_memory_utilisation:ratio*100", + "legend": "{{instance}}", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + } + ] + } + ] + }, + { + "panel_group_name": "Disk Usage Metrics", + "panels": [ + { + "panel_name": "Chaos-Node-Disk I/O Usage R/W", + "panel_options": { + "points": false, + "grids": true, + "left_axis": true + }, + "y_axis_left": "bytes read (-) / write (+)", + "y_axis_right": "CHAOS", + "x_axis_down": "Time", + "unit": "KiB", + "prom_queries": [ + { + "prom_query_name": "node_disk_read_bytes_total", + "legend": "{{instance}} - {{device}} - Successfully read bytes", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + }, + { + "prom_query_name": "node_disk_written_bytes_total", + "legend": "{{instance}} - {{device}} - Successfully written bytes", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + } + ] + }, + { + "panel_name": "Chaos-Node-Disk I/O Usage Times", + "panel_options": { + "points": false, + "grids": true, + "left_axis": true + }, + "y_axis_left": "time", + "y_axis_right": "CHAOS", + "x_axis_down": "Time", + "unit": "ms", + "prom_queries": [ + { + "prom_query_name": "node_disk_io_time_seconds_total", + "legend": "{{instance}} - {{device}} - Time 
spent doing I/Os", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + } + ] + } + ] + } + ] +} diff --git a/monitoring/dashboards/litmus-portal/pod_metrics.json b/monitoring/dashboards/litmus-portal/pod_metrics.json new file mode 100644 index 0000000..c2adf50 --- /dev/null +++ b/monitoring/dashboards/litmus-portal/pod_metrics.json @@ -0,0 +1,73 @@ +{ + "dashboardID": "generic_pod_metrics", + "name": "Pod metrics", + "information": "This dashboard visualizes Pod level CPU and memory usage metrics interleaved with chaos events.", + "chaosEventQueryTemplate": "litmuschaos_awaited_experiments{job=\"chaos-exporter\"}", + "chaosVerdictQueryTemplate": "litmuschaos_experiment_verdict{job=\"chaos-exporter\"}", + "panelGroupMap": [ + { + "groupName": "CPU Usage Metrics", + "panels": ["Chaos-Pod-CPU Usage"] + }, + { + "groupName": "Memory Usage Metrics", + "panels": ["Chaos-Pod-Memory Usage"] + } + ], + "panelGroups": [ + { + "panel_group_name": "CPU Usage Metrics", + "panels": [ + { + "panel_name": "Chaos-Pod-CPU Usage", + "panel_options": { + "points": false, + "grids": true, + "left_axis": true + }, + "y_axis_left": "Cores", + "y_axis_right": "CHAOS", + "x_axis_down": "Time", + "unit": "", + "prom_queries": [ + { + "prom_query_name": "sum(rate(container_cpu_usage_seconds_total{container!=\"POD\",pod!=\"\"}[5m])) by (pod)", + "legend": "{{pod}}", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + } + ] + } + ] + }, + { + "panel_group_name": "Memory Usage Metrics", + "panels": [ + { + "panel_name": "Chaos-Pod-Memory Usage", + "panel_options": { + "points": false, + "grids": true, + "left_axis": true + }, + "y_axis_left": "Memory", + "y_axis_right": "CHAOS", + "x_axis_down": "Time", + "unit": "GiB", + "prom_queries": [ + { + "prom_query_name": "sum(container_memory_usage_bytes{container!=\"POD\",container!=\"\"}) by (pod)", + "legend": "{{pod}}", + "resolution": "1/2", + "minstep": "5", + "line": true, + 
"close_area": false + } + ] + } + ] + } + ] +} diff --git a/monitoring/dashboards/litmus-portal/sock-shop.json b/monitoring/dashboards/litmus-portal/sock-shop.json new file mode 100644 index 0000000..39580fc --- /dev/null +++ b/monitoring/dashboards/litmus-portal/sock-shop.json @@ -0,0 +1,550 @@ +{ + "dashboardID": "sock-shop", + "name": "Sock Shop", + "information": "This dashboard visualizes Sock Shop application metrics interleaved with chaos events and chaos exporter metrics.", + "chaosEventQueryTemplate": "litmuschaos_awaited_experiments{job=\"chaos-exporter\"}", + "chaosVerdictQueryTemplate": "litmuschaos_experiment_verdict{job=\"chaos-exporter\"}", + "panelGroupMap": [ + { + "groupName": "Orders Metrics", + "panels": ["Orders QPS", "Orders Latency"] + }, + { + "groupName": "Catalogue Metrics", + "panels": ["Catalogue QPS", "Catalogue Latency"] + }, + { + "groupName": "Payment Metrics", + "panels": ["Payment QPS", "Payment Latency"] + }, + { + "groupName": "Shipping Metrics", + "panels": ["Shipping QPS", "Shipping Latency"] + }, + { + "groupName": "User Metrics", + "panels": ["User QPS", "User Latency"] + }, + { + "groupName": "Frontend Metrics", + "panels": ["Frontend QPS", "Frontend Latency"] + }, + { + "groupName": "Cart Metrics", + "panels": ["Cart QPS", "Cart Latency"] + } + ], + "panelGroups": [ + { + "panel_group_name": "Orders Metrics", + "panels": [ + { + "panel_name": "Orders QPS", + "panel_options": { + "points": false, + "grids": true, + "left_axis": true + }, + "y_axis_left": "QPS (1 min)", + "y_axis_right": "CHAOS", + "x_axis_down": "Time", + "unit": "qps", + "prom_queries": [ + { + "prom_query_name": "sum(rate(request_duration_seconds_count{job=\"orders\",status_code=~\"2..\",route!=\"metrics\"}[1m])) * 100", + "legend": "2xx", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + }, + { + "prom_query_name": "sum(rate(request_duration_seconds_count{ job=\"orders\", status_code=~\"4.+|5.+\" }[1m])) * 100", + 
"legend": "4xx/5xx", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + } + ] + }, + { + "panel_name": "Orders Latency", + "panel_options": { + "points": false, + "grids": true, + "left_axis": true + }, + "y_axis_left": "time", + "y_axis_right": "CHAOS", + "x_axis_down": "Time", + "unit": "ms", + "prom_queries": [ + { + "prom_query_name": "histogram_quantile(0.99, sum(rate(request_duration_seconds_bucket{job=\"orders\"}[1m])) by (name, le))", + "legend": "99th quantile", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + }, + { + "prom_query_name": "histogram_quantile(0.5, sum(rate(request_duration_seconds_bucket{job=\"orders\"}[1m])) by (name, le))", + "legend": "50th quantile", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + }, + { + "prom_query_name": "sum(rate(request_duration_seconds_sum{job=\"orders\"}[1m])) / sum(rate(request_duration_seconds_count{job=\"orders\"}[1m]))", + "legend": "Mean", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + } + ] + } + ] + }, + { + "panel_group_name": "Catalogue Metrics", + "panels": [ + { + "panel_name": "Catalogue QPS", + "panel_options": { + "points": false, + "grids": true, + "left_axis": true + }, + "y_axis_left": "QPS (1 min)", + "y_axis_right": "CHAOS", + "x_axis_down": "Time", + "unit": "qps", + "prom_queries": [ + { + "prom_query_name": "sum(rate(request_duration_seconds_count{job=\"catalogue\",status_code=~\"2..\",route!=\"metrics\"}[1m])) * 100", + "legend": "2xx", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + }, + { + "prom_query_name": "sum(rate(request_duration_seconds_count{ job=\"catalogue\", status_code=~\"4.+|5.+\" }[1m])) * 100", + "legend": "4xx/5xx", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + } + ] + }, + { + "panel_name": "Catalogue Latency", + "panel_options": { + "points": false, + "grids": true, + 
"left_axis": true + }, + "y_axis_left": "time", + "y_axis_right": "CHAOS", + "x_axis_down": "Time", + "unit": "ms", + "prom_queries": [ + { + "prom_query_name": "histogram_quantile(0.99, sum(rate(request_duration_seconds_bucket{job=\"catalogue\"}[1m])) by (name, le))", + "legend": "99th quantile", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + }, + { + "prom_query_name": "histogram_quantile(0.5, sum(rate(request_duration_seconds_bucket{job=\"catalogue\"}[1m])) by (name, le))", + "legend": "50th quantile", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + }, + { + "prom_query_name": "sum(rate(request_duration_seconds_sum{job=\"catalogue\"}[1m])) / sum(rate(request_duration_seconds_count{job=\"catalogue\"}[1m]))", + "legend": "Mean", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + } + ] + } + ] + }, + { + "panel_group_name": "Payment Metrics", + "panels": [ + { + "panel_name": "Payment QPS", + "panel_options": { + "points": false, + "grids": true, + "left_axis": true + }, + "y_axis_left": "QPS (1 min)", + "y_axis_right": "CHAOS", + "x_axis_down": "Time", + "unit": "qps", + "prom_queries": [ + { + "prom_query_name": "sum(rate(request_duration_seconds_count{job=\"payment\",status_code=~\"2..\",route!=\"metrics\"}[1m])) * 100", + "legend": "2xx", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + }, + { + "prom_query_name": "sum(rate(request_duration_seconds_count{ job=\"payment\", status_code=~\"4.+|5.+\" }[1m])) * 100", + "legend": "4xx/5xx", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + } + ] + }, + { + "panel_name": "Payment Latency", + "panel_options": { + "points": false, + "grids": true, + "left_axis": true + }, + "y_axis_left": "time", + "y_axis_right": "CHAOS", + "x_axis_down": "Time", + "unit": "ms", + "prom_queries": [ + { + "prom_query_name": "histogram_quantile(0.99, 
sum(rate(request_duration_seconds_bucket{job=\"payment\"}[1m])) by (name, le))", + "legend": "99th quantile", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + }, + { + "prom_query_name": "histogram_quantile(0.5, sum(rate(request_duration_seconds_bucket{job=\"payment\"}[1m])) by (name, le))", + "legend": "50th quantile", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + }, + { + "prom_query_name": "sum(rate(request_duration_seconds_sum{job=\"payment\"}[1m])) / sum(rate(request_duration_seconds_count{job=\"payment\"}[1m]))", + "legend": "Mean", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + } + ] + } + ] + }, + { + "panel_group_name": "Shipping Metrics", + "panels": [ + { + "panel_name": "Shipping QPS", + "panel_options": { + "points": false, + "grids": true, + "left_axis": true + }, + "y_axis_left": "QPS (1 min)", + "y_axis_right": "CHAOS", + "x_axis_down": "Time", + "unit": "qps", + "prom_queries": [ + { + "prom_query_name": "sum(rate(request_duration_seconds_count{job=\"shipping\",status_code=~\"2..\",route!=\"metrics\"}[1m])) * 100", + "legend": "2xx", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + }, + { + "prom_query_name": "sum(rate(request_duration_seconds_count{ job=\"shipping\", status_code=~\"4.+|5.+\" }[1m])) * 100", + "legend": "4xx/5xx", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + } + ] + }, + { + "panel_name": "Shipping Latency", + "panel_options": { + "points": false, + "grids": true, + "left_axis": true + }, + "y_axis_left": "time", + "y_axis_right": "CHAOS", + "x_axis_down": "Time", + "unit": "ms", + "prom_queries": [ + { + "prom_query_name": "histogram_quantile(0.99, sum(rate(request_duration_seconds_bucket{job=\"shipping\"}[1m])) by (name, le))", + "legend": "99th quantile", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + }, + { + "prom_query_name": 
"histogram_quantile(0.5, sum(rate(request_duration_seconds_bucket{job=\"shipping\"}[1m])) by (name, le))", + "legend": "50th quantile", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + }, + { + "prom_query_name": "sum(rate(request_duration_seconds_sum{job=\"shipping\"}[1m])) / sum(rate(request_duration_seconds_count{job=\"shipping\"}[1m]))", + "legend": "Mean", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + } + ] + } + ] + }, + { + "panel_group_name": "User Metrics", + "panels": [ + { + "panel_name": "User QPS", + "panel_options": { + "points": false, + "grids": true, + "left_axis": true + }, + "y_axis_left": "QPS (1 min)", + "y_axis_right": "CHAOS", + "x_axis_down": "Time", + "unit": "qps", + "prom_queries": [ + { + "prom_query_name": "sum(rate(request_duration_seconds_count{job=\"user\",status_code=~\"2..\",route!=\"metrics\"}[1m])) * 100", + "legend": "2xx", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + }, + { + "prom_query_name": "sum(rate(request_duration_seconds_count{ job=\"user\", status_code=~\"4.+|5.+\" }[1m])) * 100", + "legend": "4xx/5xx", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + } + ] + }, + { + "panel_name": "User Latency", + "panel_options": { + "points": false, + "grids": true, + "left_axis": true + }, + "y_axis_left": "time", + "y_axis_right": "CHAOS", + "x_axis_down": "Time", + "unit": "ms", + "prom_queries": [ + { + "prom_query_name": "histogram_quantile(0.99, sum(rate(request_duration_seconds_bucket{job=\"user\"}[1m])) by (name, le))", + "legend": "99th quantile", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + }, + { + "prom_query_name": "histogram_quantile(0.5, sum(rate(request_duration_seconds_bucket{job=\"user\"}[1m])) by (name, le))", + "legend": "50th quantile", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + }, + { + "prom_query_name": 
"sum(rate(request_duration_seconds_sum{job=\"user\"}[1m])) / sum(rate(request_duration_seconds_count{job=\"user\"}[1m]))", + "legend": "Mean", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + } + ] + } + ] + }, + { + "panel_group_name": "Frontend Metrics", + "panels": [ + { + "panel_name": "Frontend QPS", + "panel_options": { + "points": false, + "grids": true, + "left_axis": true + }, + "y_axis_left": "QPS (1 min)", + "y_axis_right": "CHAOS", + "x_axis_down": "Time", + "unit": "qps", + "prom_queries": [ + { + "prom_query_name": "sum(rate(request_duration_seconds_count{job=\"front-end\",status_code=~\"2..\",route!=\"metrics\"}[1m])) * 100", + "legend": "2xx", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + }, + { + "prom_query_name": "sum(rate(request_duration_seconds_count{ job=\"front-end\", status_code=~\"4.+|5.+\" }[1m])) * 100", + "legend": "4xx/5xx", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + } + ] + }, + { + "panel_name": "Frontend Latency", + "panel_options": { + "points": false, + "grids": true, + "left_axis": true + }, + "y_axis_left": "time", + "y_axis_right": "CHAOS", + "x_axis_down": "Time", + "unit": "ms", + "prom_queries": [ + { + "prom_query_name": "histogram_quantile(0.99, sum(rate(request_duration_seconds_bucket{job=\"front-end\"}[1m])) by (name, le))", + "legend": "99th quantile", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + }, + { + "prom_query_name": "histogram_quantile(0.5, sum(rate(request_duration_seconds_bucket{job=\"front-end\"}[1m])) by (name, le))", + "legend": "50th quantile", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + }, + { + "prom_query_name": "sum(rate(request_duration_seconds_sum{job=\"front-end\"}[1m])) / sum(rate(request_duration_seconds_count{job=\"front-end\"}[1m]))", + "legend": "Mean", + "resolution": "1/2", + "minstep": "5", + "line": true, + 
"close_area": false + } + ] + } + ] + }, + { + "panel_group_name": "Cart Metrics", + "panels": [ + { + "panel_name": "Cart QPS", + "panel_options": { + "points": false, + "grids": true, + "left_axis": true + }, + "y_axis_left": "QPS (1 min)", + "y_axis_right": "CHAOS", + "x_axis_down": "Time", + "unit": "qps", + "prom_queries": [ + { + "prom_query_name": "sum(rate(request_duration_seconds_count{job=\"carts\",status_code=~\"2..\",route!=\"metrics\"}[1m])) * 100", + "legend": "2xx", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + }, + { + "prom_query_name": "sum(rate(request_duration_seconds_count{ job=\"carts\", status_code=~\"4.+|5.+\" }[1m])) * 100", + "legend": "4xx/5xx", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + } + ] + }, + { + "panel_name": "Cart Latency", + "panel_options": { + "points": false, + "grids": true, + "left_axis": true + }, + "y_axis_left": "time", + "y_axis_right": "CHAOS", + "x_axis_down": "Time", + "unit": "ms", + "prom_queries": [ + { + "prom_query_name": "histogram_quantile(0.99, sum(rate(request_duration_seconds_bucket{job=\"carts\"}[1m])) by (name, le))", + "legend": "99th quantile", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + }, + { + "prom_query_name": "histogram_quantile(0.5, sum(rate(request_duration_seconds_bucket{job=\"carts\"}[1m])) by (name, le))", + "legend": "50th quantile", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + }, + { + "prom_query_name": "sum(rate(request_duration_seconds_sum{job=\"carts\"}[1m])) / sum(rate(request_duration_seconds_count{job=\"carts\"}[1m]))", + "legend": "Mean", + "resolution": "1/2", + "minstep": "5", + "line": true, + "close_area": false + } + ] + } + ] + } + ] +}