This commit is contained in:
2025-06-07 14:09:21 -03:00
commit f89d835c47
11 changed files with 1038 additions and 0 deletions

149
monitoring/README.md Normal file
View File

@@ -0,0 +1,149 @@
# OpenCand Monitoring Setup
This monitoring stack provides comprehensive log aggregation and visualization for the OpenCand project, with special focus on the ETL service.
## Services Overview
### 🔍 **Grafana Loki** (Port 3100)
- **Purpose**: Log aggregation and storage
- **Access**: http://localhost:3100
- **Description**: Collects and stores all container logs in a structured format
### 📊 **Grafana** (Port 3000)
- **Purpose**: Log visualization and dashboards
- **Access**: http://localhost:3000
- **Credentials**:
  - Username: `admin`
  - Password: `admin`
- **Pre-configured Dashboards**: OpenCand ETL Monitoring dashboard
### 📈 **Prometheus** (Port 9090)
- **Purpose**: Metrics collection and storage
- **Access**: http://localhost:9090
- **Description**: Collects system and application metrics
### 🖥️ **Node Exporter** (Port 9100)
- **Purpose**: System metrics collection
- **Access**: http://localhost:9100/metrics
- **Description**: Provides host system metrics (CPU, memory, disk, etc.)
### 🚚 **Promtail**
- **Purpose**: Log collection agent
- **Description**: Automatically discovers and ships Docker container logs to Loki
## Key Features
### ETL-Specific Monitoring
- ✅ Real-time ETL process logs
- ✅ Error tracking and alerting capabilities
- ✅ Performance metrics monitoring
- ✅ Data processing progress tracking
### Container Log Management
- ✅ Automatic log rotation (10MB max size, 3 files)
- ✅ Structured log labeling
- ✅ Multi-service log aggregation
### Pre-built Dashboards
- ✅ OpenCand ETL Logs viewer
- ✅ API logs monitoring
- ✅ Database logs tracking
- ✅ Container resource usage
## Getting Started
1. **Start the monitoring stack**:
   ```bash
   docker-compose up -d
   ```
2. **Access Grafana**:
   - Open http://localhost:3000
   - Login with admin/admin
   - Navigate to "Dashboards" → "OpenCand ETL Monitoring"
3. **View ETL Logs in Real-time**:
   - In Grafana, go to "Explore"
   - Select "Loki" as datasource
   - Use query: `{container_name="opencand_etl"}`
4. **Monitor System Metrics**:
   - Access Prometheus at http://localhost:9090
   - View system metrics from Node Exporter
## Log Query Examples
### ETL Service Logs
```logql
{container_name="opencand_etl"}
```
### Error Logs Only
```logql
{container_name="opencand_etl"} |= "ERROR"
```
### API Logs with Filtering
```logql
{container_name="opencand_api"} |= "Microsoft.AspNetCore"
```
### Database Connection Logs
```logql
{container_name="opencand_db"} |= "connection"
```
## Configuration Files
- **Loki**: `./monitoring/loki-config.yaml`
- **Promtail**: `./monitoring/promtail-config.yaml`
- **Prometheus**: `./monitoring/prometheus.yml`
- **Grafana Datasources**: `./monitoring/grafana/provisioning/datasources/`
- **Grafana Dashboards**: `./monitoring/grafana/provisioning/dashboards/`
## Data Persistence
The following volumes are created for data persistence:
- `loki-data`: Loki log storage
- `prometheus-data`: Prometheus metrics storage
- `grafana-data`: Grafana dashboards and settings
## Troubleshooting
### ETL Logs Not Appearing
1. Check if ETL container is running: `docker ps`
2. Verify Promtail is collecting logs: `docker logs opencand_promtail`
3. Check Loki status: `curl http://localhost:3100/ready`
### Grafana Dashboard Issues
1. Verify datasources are configured correctly
2. Check if Loki is accessible from Grafana container
3. Restart Grafana container: `docker-compose restart grafana`
### Performance Issues
1. Monitor disk usage for log storage
2. Adjust log retention in `loki-config.yaml`
3. Increase resource limits if needed
## Customization
### Adding More Dashboards
1. Create JSON dashboard files in `./monitoring/grafana/provisioning/dashboards/`
2. Restart Grafana container
### Log Retention Configuration
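Edit `./monitoring/loki-config.yaml` to adjust retention policies:
```yaml
limits_config:
  retention_period: 168h # 7 days
```
Note: `retention_period` is enforced by the Loki compactor and only takes effect when the compactor is configured with `retention_enabled: true`. The `loki-config.yaml` in this directory defines neither a `limits_config` nor a `compactor` section, so both must be added (see the Loki retention documentation for the compactor options that match your Loki version).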
### Alert Configuration
Add alerting rules to Prometheus configuration for ETL failure notifications.
## Security Notes
- Change default Grafana admin password in production
- Restrict network access to monitoring ports
- Consider using authentication for external access
- Regularly update monitoring stack images

View File

@@ -0,0 +1,12 @@
# Grafana dashboard provisioning: a single file-based provider that loads
# dashboard definitions from the provisioning directory inside the container.
apiVersion: 1

providers:
  - name: 'default'
    type: file
    orgId: 1
    # Empty string: dashboards are not placed in a named folder.
    folder: ''
    options:
      path: /etc/grafana/provisioning/dashboards
    # The directory is re-read every 10 seconds.
    updateIntervalSeconds: 10
    disableDeletion: false
    allowUiUpdates: true

View File

@@ -0,0 +1,197 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"links": [],
"panels": [
{
"datasource": "Loki",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"showLabels": false,
"showTime": false,
"sortOrder": "Descending",
"wrapLogMessage": false,
"prettifyLogMessage": false,
"enableLogDetails": true,
"dedupStrategy": "none"
},
"targets": [
{
"expr": "{container_name=\"opencand_etl\"}",
"refId": "A"
}
],
"title": "OpenCand ETL Logs",
"type": "logs"
},
{
"datasource": "Loki",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"id": 2,
"options": {
"showLabels": false,
"showTime": false,
"sortOrder": "Descending",
"wrapLogMessage": false,
"prettifyLogMessage": false,
"enableLogDetails": true,
"dedupStrategy": "none"
},
"targets": [
{
"expr": "{container_name=\"opencand_api\"}",
"refId": "A"
}
],
"title": "OpenCand API Logs",
"type": "logs"
},
{
"datasource": "Loki",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"id": 3,
"options": {
"showLabels": false,
"showTime": false,
"sortOrder": "Descending",
"wrapLogMessage": false,
"prettifyLogMessage": false,
"enableLogDetails": true,
"dedupStrategy": "none"
},
"targets": [
{
"expr": "{container_name=\"opencand_db\"}",
"refId": "A"
}
],
"title": "PostgreSQL Database Logs",
"type": "logs"
},
{
"datasource": "Prometheus",
"fieldConfig": {
"defaults": {
"custom": {
"align": null,
"displayMode": "auto"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 16
},
"id": 4,
"options": {
"orientation": "auto",
"reduceOptions": {
"values": false,
"calcs": [
"lastNotNull"
],
"fields": ""
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"text": {}
},
"pluginVersion": "7.5.7",
"targets": [
{
"expr": "rate(container_cpu_usage_seconds_total{name=~\"opencand.*\"}[5m]) * 100",
"interval": "",
"legendFormat": "{{name}}",
"refId": "A"
}
],
"title": "Container CPU Usage (%)",
"type": "stat"
}
],
"schemaVersion": 27,
"style": "dark",
"tags": [
"opencand",
"etl",
"logs"
],
"templating": {
"list": []
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "OpenCand ETL Monitoring",
"uid": "opencand-etl",
"version": 1
}

View File

@@ -0,0 +1,16 @@
# Grafana datasource provisioning: registers the Loki (logs) and Prometheus
# (metrics) backends, both reached through the Grafana server (proxy access).
apiVersion: 1

datasources:
  # Log backend; not the default datasource.
  - name: Loki
    type: loki
    url: http://loki:3100
    access: proxy
    isDefault: false
    jsonData:
      maxLines: 1000

  # Metrics backend; this is the default datasource.
  - name: Prometheus
    type: prometheus
    url: http://prometheus:9090
    access: proxy
    isDefault: true

View File

@@ -0,0 +1,49 @@
# Loki configuration: filesystem-backed storage rooted at /loki with an
# in-memory ring key-value store and a replication factor of 1.
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  path_prefix: /loki
  replication_factor: 1
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  ring:
    instance_addr: 127.0.0.1
    kvstore:
      store: inmemory

# Query results are cached in an embedded cache capped at 100 MB.
query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 100

# Index/chunk schema in effect from 2020-10-24: boltdb-shipper index on the
# local filesystem, schema v11, one index table per 24h.
schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

# NOTE(review): this points at localhost:9093 - confirm an Alertmanager is
# actually reachable there from the Loki container.
ruler:
  alertmanager_url: http://localhost:9093

# By default, Loki will send anonymous, but uniquely-identifiable usage and configuration
# analytics to Grafana Labs. These statistics are sent to https://stats.grafana.org/
#
# Statistics help us better understand how Loki is used, and they show us performance
# levels for most users. This helps us prioritize features and documentation.
# For more information on what's sent: https://github.com/grafana/loki/blob/main/docs/sources/configuration/telemetry.md
# Refer to the buildReport method to see what goes into a report.
#
# If you would like to disable reporting, uncomment the following lines:
#analytics:
#  reporting_enabled: false

35
monitoring/prometheus.yml Normal file
View File

@@ -0,0 +1,35 @@
# Prometheus configuration: global intervals plus four scrape jobs
# (Prometheus itself, node-exporter, the OpenCand API, and Docker discovery).
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# No rule files are loaded; the entries below are commented-out placeholders.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"

scrape_configs:
  # Prometheus scraping its own metrics endpoint.
  - job_name: 'prometheus'
    static_configs:
      - targets:
          - 'localhost:9090'

  # Host metrics exposed by the node-exporter service.
  - job_name: 'node-exporter'
    static_configs:
      - targets:
          - 'node-exporter:9100'

  # OpenCand API, scraped every 30s (overrides the 15s global interval).
  - job_name: 'opencand-api'
    metrics_path: '/metrics'
    scrape_interval: 30s
    static_configs:
      - targets:
          - 'api:8080'

  # Targets discovered from the Docker daemon through its local socket.
  - job_name: 'docker-containers'
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 5s
    relabel_configs:
      # The regex captures the container name without its leading slash.
      - source_labels:
          - __meta_docker_container_name
        regex: '/(.*)'
        target_label: container_name
      - source_labels:
          - __meta_docker_container_id
        target_label: container_id
      - source_labels:
          - __meta_docker_container_label_com_docker_compose_service
        target_label: compose_service

View File

@@ -0,0 +1,71 @@
# Promtail configuration: ships Docker container logs to Loki.
server:
  http_listen_port: 9080
  grpc_listen_port: 0

# File in which Promtail records how far it has read each log file.
# NOTE(review): the file lives under /tmp - confirm this path is on a
# persistent volume, otherwise read offsets are lost when the container is
# recreated and log files are read again from the start.
positions:
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  # Docker container logs, tailed from the json-file driver's log files.
  - job_name: containers
    static_configs:
      - targets:
          - localhost
        labels:
          job: containerlogs
          __path__: /var/lib/docker/containers/*/*log
    pipeline_stages:
      # Unwrap the log-file JSON envelope. `time` is extracted here so that
      # the timestamp stage below has a value to read; without it that stage
      # finds no `time` entry and the line keeps the time it was scraped.
      - json:
          expressions:
            output: log
            stream: stream
            attrs:
            time: time
      # `attrs` is itself a JSON object; pull the `tag` field out of it.
      - json:
          source: attrs
          expressions:
            tag:
      # container_name is everything in the tag before the first '|'.
      # NOTE(review): this presumably relies on a Docker logging `tag` option
      # that contains '|' - verify against the compose file.
      - regex:
          source: tag
          expression: (?P<container_name>(?:[^|]*))\|
      # Use the time recorded in the log file as the entry timestamp.
      - timestamp:
          source: time
          format: RFC3339Nano
      - labels:
          stream:
          container_name:
      # Ship only the original log line, not the JSON envelope.
      - output:
          source: output

  # ETL specific logs: containers discovered via the Docker daemon and
  # limited to those carrying the label promtail.enable=true.
  - job_name: etl-logs
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 5s
        filters:
          - name: label
            values: ["promtail.enable=true"]
    relabel_configs:
      - source_labels: ['__meta_docker_container_label_promtail_job']
        target_label: 'job'
      # The regex captures the container name without its leading slash.
      # NOTE(review): this job labels streams `container`, whereas the
      # `containers` job above labels them `container_name` - confirm which
      # label the dashboards and queries are expected to use.
      - source_labels: ['__meta_docker_container_name']
        regex: '/(.*)'
        target_label: 'container'
      - source_labels: ['__meta_docker_container_log_stream']
        target_label: 'stream'
    pipeline_stages:
      # NOTE(review): these stages assume each line is a JSON envelope with
      # `log`, `stream` and `time` keys - confirm that lines delivered by
      # Docker service discovery actually have that shape.
      - json:
          expressions:
            output: log
            stream: stream
            timestamp: time
      - timestamp:
          source: timestamp
          format: RFC3339Nano
      - output:
          source: output