Skip to content

Commit

Permalink
Add monitoring stack (#36)
Browse files Browse the repository at this point in the history
This adds a Grafana + Prometheus + Postgres + Loki setup which is deployed via docker-compose and ansible to a remote machine (e.g. an EC2 instance)

2 docker images work to fill data in Postgres, one gathers data on test cases (only the failed tests are stored in pg) and the other gathers the hierarchy of data for Jenkins builds (jobs(`tvm`) -> builds (`main` or `PR-1234`) -> stages (`build: CPU`) -> steps (`run_a_script.sh`)), and Prometheus just scrapes Jenkins
  • Loading branch information
driazati committed Jun 14, 2022
1 parent e18a731 commit 09c5f7d
Show file tree
Hide file tree
Showing 22 changed files with 1,367 additions and 0 deletions.
1 change: 1 addition & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,4 @@ jobs:
VALIDATE_BASH: false
VALIDATE_TERRAFORM_TERRASCAN: false
VALIDATE_MARKDOWN: false
FILTER_REGEX_EXCLUDE: monitoring/.*
7 changes: 7 additions & 0 deletions monitoring/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
vars.yml
*.tar.gz
*.pem
__pycache__/
*.env
*.log
*.json
48 changes: 48 additions & 0 deletions monitoring/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
1. Install dependencies

```bash
pip install ansible

# For local installs
sudo apt install -y sshpass
```

2. Create a file called `vars.yml` that looks like

```yaml
passwords:
  grafana_admin_username: 123
  grafana_admin: 123
  postgres_user: 123
  postgres_password: 123
```

3. Generate keys

```bash
cd files
openssl req -newkey rsa:2048 -nodes -keyout key.pem -x509 -days 365 -out certificate.pem
```

4. Run the Ansible playbook to provision the machine

```bash
ansible-playbook -i <ssh remote>, install.yml --extra-vars @vars.yml

# For local installs
ansible-playbook -i <ssh remote>, install.yml --extra-vars @vars.yml -kK
```


## Debugging

```bash
# see why containers aren't up
sudo docker stack ps monitoring --no-trunc

# see grafana logs
sudo docker service logs monitoring_grafana --raw

# log into a container
sudo docker ps # get id
sudo docker exec -it <ID> /bin/bash
```

4 changes: 4 additions & 0 deletions monitoring/ansible.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# SSH transport settings Ansible uses when running the playbook.
[ssh_connection]
# Reuse one multiplexed SSH connection per host; the master connection stays
# open in the background for 60s after its last use.
ssh_args = -o ControlMaster=auto -o ControlPersist=60s
# Execute modules over the open SSH session instead of transferring them to
# the remote host first, which reduces the number of SSH operations per task.
pipelining = True

112 changes: 112 additions & 0 deletions monitoring/files/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# Monitoring stack: Grafana (UI), Loki (logs), Postgres (build/test data),
# Prometheus (Jenkins metrics), two Jenkins fetchers and an nginx TLS frontend.
#
# This file is a template: the `passwords.*` placeholders are substituted
# before the file is handed to Docker (presumably by the Ansible playbook).
# Every templated value is wrapped in double quotes so the rendered YAML stays
# a string even if a credential contains ` #`, `: ` or looks like a
# boolean/number. Credentials must therefore not contain `"` or `\`.
version: '3.7'

services:
  grafana:
    image: grafana/grafana:8.4.4
    logging:
      driver: "json-file"
      options:
        max-size: "20m"
        max-file: "10"
    networks:
      - monitoring
    volumes:
      - /etc/tvm/grafana:/var/lib/grafana
      - /etc/tvm/grafana-provisioning/:/etc/grafana/provisioning
      - /etc/tvm/email_template.html:/usr/share/grafana/public/emails/alert_notification.html
      - /etc/tvm/grafana.ini:/etc/grafana/grafana.ini
      - /etc/tvm/dashboards:/var/lib/grafana/dashboards

  loki:
    image: grafana/loki:main-52f9df4
    logging:
      driver: "json-file"
      options:
        max-size: "20m"
        max-file: "10"
    networks:
      - monitoring
    volumes:
      - /etc/tvm/loki-config.yml:/etc/loki/loki-config.yaml
      - /etc/tvm/loki_data:/data/loki
    command: -config.file=/etc/loki/loki-config.yaml

  postgres:
    image: postgres:12.10
    logging:
      driver: "json-file"
      options:
        max-size: "20m"
        max-file: "10"
    networks:
      - monitoring
    environment:
      - "POSTGRES_USER={{ passwords.postgres_user }}"
      - "POSTGRES_PASSWORD={{ passwords.postgres_password }}"
    volumes:
      - /etc/tvm/postgres_data:/var/lib/postgresql/data

  prometheus:
    image: prom/prometheus:v2.34.0
    volumes:
      # NOTE(review): the single-file mount and the directory mount below both
      # target /etc/prometheus - confirm the overlap is intended.
      - /etc/tvm/prometheus.yml:/etc/prometheus/prometheus.yml
      - /etc/tvm/prometheus/:/etc/prometheus/
      - /etc/tvm/prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
    # Prometheus is not published on the host; uncomment to expose it.
    # ports:
    #   - "9090:9090"
    networks:
      - monitoring

  fetcher:
    image: jenkins_fetcher:latest
    environment:
      db_host: "postgres:5432"
      db_user: "{{ passwords.postgres_user }}"
      db_password: "{{ passwords.postgres_password }}"
      loki_host: "loki:3100"
    volumes:
      # Same host directory as `testfetcher`, so both share one HTTP cache.
      - /etc/tvm/fetcher_data:/opt/fetcher/.httpcache
    networks:
      - monitoring
    depends_on:
      - postgres

  testfetcher:
    image: jenkins_testfetcher:latest
    environment:
      db_host: "postgres:5432"
      db_user: "{{ passwords.postgres_user }}"
      db_password: "{{ passwords.postgres_password }}"
      loki_host: "loki:3100"
    volumes:
      # Same host directory as `fetcher`, so both share one HTTP cache.
      - /etc/tvm/fetcher_data:/opt/fetcher/.httpcache
    networks:
      - monitoring
    depends_on:
      - postgres

  nginx:
    image: nginx:1.21.0
    # The only service published on the host: HTTP and HTTPS.
    ports:
      - "80:80"
      - "443:443"
    logging:
      driver: "json-file"
      options:
        max-size: "20m"
        max-file: "10"
    networks:
      - monitoring
    volumes:
      - "/etc/tvm/http.conf:/etc/nginx/conf.d/default.conf"
      - "/etc/tvm/certificate.pem:/etc/nginx/fullchain.pem"
      - "/etc/tvm/key.pem:/etc/nginx/privkey.pem"

networks:
  # Stack-private network; services reach each other by service name.
  monitoring:
    external: false
21 changes: 21 additions & 0 deletions monitoring/files/email_template.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{% raw %}
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width" />
</head>

<body>
<p>{{.Name}} [{{.State}}]</p>
<p>{{.Message}}</p>
<p>{{.Error}}</p>
<p>{{.RuleUrl}}</p>
<p>{{.AlertPageUrl}}</p>
<p>Values:</p>
{{range .EvalMatches}} &nbsp;&nbsp;{{.Metric}} = {{.Value}}<br />{{end}}
{{.ImageLink}}
</body>
</html>
{% endraw %}

21 changes: 21 additions & 0 deletions monitoring/files/grafana-provisioning/datasources/data.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Grafana datasource provisioning: registers the three backends of the
# monitoring stack, addressed by their docker-compose service names.
#
# This file is a template: the `passwords.*` placeholders are substituted
# before Grafana reads it. Templated values are double-quoted so the rendered
# YAML always yields strings; credentials must not contain `"` or `\`.
apiVersion: 1
datasources:
  - name: Loki
    type: loki
    # access: proxy
    url: http://loki:3100
    version: 1
  - name: Prometheus
    type: prometheus
    # access: proxy
    url: http://prometheus:9090
    version: 1
  - name: Postgres
    type: postgres
    url: "postgres:5432"
    database: tvm
    user: "{{ passwords.postgres_user }}"
    secureJsonData:
      password: "{{ passwords.postgres_password }}"
    jsonData:
      # Plain (non-TLS) connection to the Postgres service.
      sslmode: disable
14 changes: 14 additions & 0 deletions monitoring/files/grafana.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Grafana server configuration (mounted at /etc/grafana/grafana.ini by the
# compose file). The admin credentials are template placeholders taken from
# the `passwords` variables - presumably rendered by the Ansible playbook.
[security]
admin_user = {{ passwords.grafana_admin_username }}
admin_password = {{ passwords.grafana_admin }}

[users]
# Visitors cannot register accounts themselves.
allow_sign_up = false
# New users should be able to use explore and create/edit dashboards
auto_assign_org_role = Editor

[auth.anonymous]
# Viewing Grafana requires logging in.
enabled = false

[unified_alerting]
# Use Grafana's unified alerting system.
enabled = true
59 changes: 59 additions & 0 deletions monitoring/files/http.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# nginx frontend for the monitoring stack: redirects HTTP to HTTPS and
# reverse-proxies all HTTPS traffic to the Grafana container.

# Plain-HTTP listener (IPv4 and IPv6): every request is redirected to HTTPS.
server {
    listen [::]:80 ipv6only=off;
    server_name /;

    location / {
        return 301 https://$host$request_uri;
    }
}

# NOTE(review): nothing below references this upstream group; proxy_pass uses
# $grafana_upstream_endpoint (grafana:3000) instead. Confirm whether it is
# still needed.
upstream grafana {
    server grafana;
}

# HTTPS listener (IPv4 and IPv6) terminating TLS with the mounted certificate.
server {
    listen [::]:443 ipv6only=off ssl;
    ssl_certificate /etc/nginx/fullchain.pem;
    ssl_certificate_key /etc/nginx/privkey.pem;

    # NOTE(review): `location /` sets its own 100m limit, which takes
    # precedence there - confirm which value is intended.
    client_max_body_size 500M;

    # Keeping the target in a variable makes nginx resolve the hostname at
    # request time through the `resolver` below, rather than once at startup.
    set $grafana_upstream_endpoint http://grafana:3000;

    # Adding a workaround for nginx rule https://grafana.com/blog/2021/11/03/grafana-8.2.3-released-with-medium-severity-security-fix-cve-2021-41174-grafana-xss/
    location ~ \{\{ {
        deny all;
    }

    location / {
        # 127.0.0.11 is Docker's embedded DNS server; answers are cached 30s.
        resolver 127.0.0.11 valid=30s ipv6=off;
        proxy_pass $grafana_upstream_endpoint;

        # Forward the original host and client details to Grafana.
        proxy_set_header Host $host:$server_port;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Host $host;
        proxy_set_header X-Forwarded-Port $server_port;
        proxy_set_header X-Forwarded-Server $host:$server_port;
        proxy_set_header X-Forwarded-Proto $scheme;
        proxy_hide_header X-Frame-Options;

        # Pass through connection upgrades (e.g. WebSocket).
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";

        # Never spill proxied responses to temporary files on disk.
        proxy_max_temp_file_size 0;

        client_max_body_size 100m;
        client_body_buffer_size 128k;

        proxy_connect_timeout 90;
        proxy_send_timeout 90;
        proxy_read_timeout 90;

        proxy_buffer_size 4k;
        proxy_buffers 4 32k;
        proxy_busy_buffers_size 64k;
        proxy_temp_file_write_size 64k;
    }
}
37 changes: 37 additions & 0 deletions monitoring/files/loki-config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Single-instance Loki configuration: no authentication, in-memory ring,
# filesystem storage, and ingestion limits effectively lifted.
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  # NOTE(review): all Loki state lives under /tmp/loki, but the compose file
  # mounts its persistent volume at /data/loki - confirm whether log data is
  # meant to survive container re-creation.
  path_prefix: /tmp/loki
  storage:
    filesystem:
      chunks_directory: /tmp/loki/chunks
      rules_directory: /tmp/loki/rules
  replication_factor: 1
  ring:
    instance_addr: 127.0.0.1
    kvstore:
      store: inmemory

schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

# Limits are set very high (or to 0) so ingestion of backfilled Jenkins logs
# is not throttled or rejected for being old.
limits_config:
  enforce_metric_name: false
  reject_old_samples: false
  # reject_old_samples_max_age: 168h
  max_query_length: 0
  ingestion_rate_mb: 100000
  ingestion_burst_size_mb: 1000000
  max_global_streams_per_user: 0
  cardinality_limit: 10000000
19 changes: 19 additions & 0 deletions monitoring/files/prometheus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Prometheus configuration for the monitoring stack: scrapes Jenkins metrics.

# Global defaults applied to every scrape job and rule group.
global:
  scrape_interval: 30s  # Scrape targets every 30 seconds. Default is every 1 minute.
  evaluation_interval: 30s  # Evaluate rules every 30 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
# No rule files are configured yet; list them here when needed.
rule_files: []
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# the Jenkins server, via its /prometheus metrics path.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "jenkins"
    metrics_path: /prometheus
    # NOTE(review): no `scheme` is set, so Prometheus uses its default (http) -
    # confirm the Jenkins host serves or redirects this as expected.
    static_configs:
      - targets: ["ci.tlcpack.ai"]
8 changes: 8 additions & 0 deletions monitoring/images/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Image that runs forward.py as a long-lived fetcher process.
FROM python:3.9.12

WORKDIR /opt/fetcher

# Install dependencies before copying the sources so that editing a .py file
# does not invalidate the (slow) pip layer; skip pip's cache to keep the
# image smaller.
COPY requirements.txt ./
RUN python3 -m pip install --no-cache-dir -r requirements.txt

COPY *.py ./

# Exec form runs Python directly rather than under `/bin/sh -c`.
# Presumably re-runs the fetch every 15 minutes, forever - see forward.py.
CMD ["python3", "forward.py", "--forever", "--wait-minutes", "15"]
8 changes: 8 additions & 0 deletions monitoring/images/Dockerfile.tests
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Image that runs tests_fetcher.py as a long-lived fetcher process.
FROM python:3.9.12

WORKDIR /opt/fetcher

# Install dependencies before copying the sources so that editing a .py file
# does not invalidate the (slow) pip layer; skip pip's cache to keep the
# image smaller.
COPY requirements.txt ./
RUN python3 -m pip install --no-cache-dir -r requirements.txt

COPY *.py ./

# Exec form runs Python directly rather than under `/bin/sh -c`.
# Presumably re-runs the fetch every 15 minutes, forever - see tests_fetcher.py.
CMD ["python3", "tests_fetcher.py", "--forever", "--wait-minutes", "15"]
Empty file added monitoring/images/__init__.py
Empty file.
Loading

0 comments on commit 09c5f7d

Please sign in to comment.