From b1e4757c70e9f742186649a3a9a8159ca5aad47c Mon Sep 17 00:00:00 2001 From: Michael Still Date: Sun, 23 Aug 2020 20:47:14 +1000 Subject: [PATCH 1/3] Optionally be verboser. --- ansible/deployandtest.sh | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/ansible/deployandtest.sh b/ansible/deployandtest.sh index ad74290..4a98f53 100755 --- a/ansible/deployandtest.sh +++ b/ansible/deployandtest.sh @@ -10,6 +10,7 @@ CLOUD=${1:-$CLOUD} TERRAFORM_VARS="" ANSIBLE_VARS="" +VERBOSE="-v" #### AWS if [ "$CLOUD" == "aws" ] || [ "$CLOUD" == "aws-single-node" ] @@ -216,6 +217,12 @@ else fi VARIABLES="$VARIABLES release=$RELEASE" +#### Mode selection, deploy or hotfix at this time +if [ -z "$MODE" ] +then + MODE="deploy" +fi + #### Default settings BOOTDELAY="${BOOTDELAY:-2}" ADMIN_PASSWORD="${ADMIN_PASSWORD:-Ukoh5vie}" @@ -233,6 +240,7 @@ ANSIBLE_VARS="$ANSIBLE_VARS ansible_root=$cwd" ANSIBLE_VARS="$ANSIBLE_VARS uniqifier=$UNIQIFIER" ANSIBLE_VARS="$ANSIBLE_VARS admin_password=$ADMIN_PASSWORD" ANSIBLE_VARS="$ANSIBLE_VARS floating_network_ipblock=$FLOATING_IP_BLOCK" +ANSIBLE_VARS="$ANSIBLE_VARS mode=$MODE" echo "VARIABLES: $VARIABLES" @@ -242,24 +250,24 @@ do ANSIBLE_VARS="$ANSIBLE_VARS $var" done -ansible-playbook -i hosts --extra-vars "$ANSIBLE_VARS" deploy.yml +ansible-playbook $VERBOSE -i hosts --extra-vars "$ANSIBLE_VARS" deploy.yml if [ -e terraform/$CLOUD/local.yml ] then - ansible-playbook -i hosts --extra-vars "$ANSIBLE_VARS" terraform/$CLOUD/local.yml + ansible-playbook $VERBOSE -i hosts --extra-vars "$ANSIBLE_VARS" terraform/$CLOUD/local.yml fi # Old fashioned ansible CI if [ "%$SKIP_SF_TESTS%" == "%%" ] then - ansible-playbook -i hosts --extra-vars "$ANSIBLE_VARS" ../ansible-ci/pretest.yml + ansible-playbook $VERBOSE -i hosts --extra-vars "$ANSIBLE_VARS" ../ansible-ci/pretest.yml for playbook in `ls ../ansible-ci/tests/test_*.yml | grep -v test_final.yml | shuf` do - ansible-playbook -i hosts --extra-vars "$ANSIBLE_VARS" $playbook + 
ansible-playbook $VERBOSE -i hosts --extra-vars "$ANSIBLE_VARS" $playbook done - ansible-playbook -i hosts --extra-vars "$ANSIBLE_VARS" ../ansible-ci/tests/test_final.yml + ansible-playbook $VERBOSE -i hosts --extra-vars "$ANSIBLE_VARS" ../ansible-ci/tests/test_final.yml # New fangled python CI - ansible-playbook -i hosts --extra-vars "$ANSIBLE_VARS" test.yml + ansible-playbook $VERBOSE -i hosts --extra-vars "$ANSIBLE_VARS" test.yml fi From c5239ce703bb2e4e264c40703585481ea73e1931 Mon Sep 17 00:00:00 2001 From: Michael Still Date: Tue, 25 Aug 2020 19:11:56 +1000 Subject: [PATCH 2/3] Add a hotfix option (untested) and turn off IPv6. --- README.md | 1 + .../tests/_util_network_cirros_validate.yml | 6 + ansible-ci/tests/test_cloudinit.yml | 6 + ...test_instances_start_with_dhcp_correct.yml | 1 + ansible-ci/tests/test_networking.yml | 2 + ansible-ci/tests/test_state_changes.yml | 72 +++++----- ansible/deploy.yml | 123 ++++++++++++++---- 7 files changed, 151 insertions(+), 60 deletions(-) diff --git a/README.md b/README.md index d4e5e85..ff6008a 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,7 @@ with real users. | Option | Terraform definition | Description | |--------|----------------------|-------------| +| MODE | All | Options are "deploy" (the default) or "hotfix". Deploy performs a full install, whereas hotfix skips steps to try to push only changes to Shaken Fist code as quickly as possible. 
| | CLOUD | All | The terraform definition to use | | ADMIN_PASSWORD | All | The admin password for the cloud once installed | | FLOATING_IP_BLOCK | All | The IP range to use for the floating network | diff --git a/ansible-ci/tests/_util_network_cirros_validate.yml b/ansible-ci/tests/_util_network_cirros_validate.yml index 5f9acea..a0c9c81 100644 --- a/ansible-ci/tests/_util_network_cirros_validate.yml +++ b/ansible-ci/tests/_util_network_cirros_validate.yml @@ -7,6 +7,12 @@ content: | #!/bin/bash -e + while [ `sf-client instance events {{uuid}} | grep -c "login prompt"` -lt 1 ] + do + echo "Waiting for login prompt..." + sleep 10 + done + [ `/opt/telnet_client.py {{console_port}} ifconfig eth0 | grep -c "{{netblock_octets}}"` -eq 1 ] [ `/opt/telnet_client.py {{console_port}} cat /etc/resolv.conf | grep -c "8.8.8.8"` -eq 1 ] [ `/opt/telnet_client.py {{console_port}} netstat -rn | grep -c "{{netblock_octets}}.1"` -eq 1 ] diff --git a/ansible-ci/tests/test_cloudinit.yml b/ansible-ci/tests/test_cloudinit.yml index 1aaeaec..03ff360 100644 --- a/ansible-ci/tests/test_cloudinit.yml +++ b/ansible-ci/tests/test_cloudinit.yml @@ -36,6 +36,12 @@ content: | #!/bin/bash -e + while [ `sf-client instance events {{cirros_with_extras_uuid}} | grep -c "login prompt"` -lt 1 ] + do + echo "Waiting for login prompt..." 
+ sleep 10 + done + [ `/opt/telnet_client.py {{cirros_with_extras_console_port}} exists /home/cirros/output.txt | grep -c "File exists"` -eq 2 ] [ `/opt/telnet_client.py {{cirros_with_extras_console_port}} exists /home/cirros/.ssh/authorized_keys | grep -c "File exists"` -eq 2 ] dest: /tmp/shell_script diff --git a/ansible-ci/tests/test_instances_start_with_dhcp_correct.yml b/ansible-ci/tests/test_instances_start_with_dhcp_correct.yml index ac1a6dc..6b23199 100644 --- a/ansible-ci/tests/test_instances_start_with_dhcp_correct.yml +++ b/ansible-ci/tests/test_instances_start_with_dhcp_correct.yml @@ -108,6 +108,7 @@ tasks: - include: _util_network_cirros_validate.yml console_port="{{cirros_console_port}}" + uuid="{{cirros_uuid}}" netblock_octets="192.168.242" - hosts: sf-1 diff --git a/ansible-ci/tests/test_networking.yml b/ansible-ci/tests/test_networking.yml index edf517c..7e197b7 100644 --- a/ansible-ci/tests/test_networking.yml +++ b/ansible-ci/tests/test_networking.yml @@ -138,6 +138,7 @@ - include: _util_network_cirros_validate.yml console_port="{{cirros_net_two_console_port}}" + uuid="{{cirros_net_two_uuid}}" netblock_octets="192.168.240" - name: Check we can't talk to the other virtual network @@ -179,6 +180,7 @@ - include: _util_network_cirros_validate.yml console_port="{{cirros_net_three_console_port}}" + uuid="{{cirros_net_three_uuid}}" netblock_octets="192.168.242" - name: Check we can't talk to the other virtual network diff --git a/ansible-ci/tests/test_state_changes.yml b/ansible-ci/tests/test_state_changes.yml index 0aa2a31..df28f16 100644 --- a/ansible-ci/tests/test_state_changes.yml +++ b/ansible-ci/tests/test_state_changes.yml @@ -10,37 +10,41 @@ connection: ssh tasks: - - include: _util_instance_reboot.yml - uuid={{cirros_uuid}} - type=soft - - - include: _util_network_cirros_validate.yml - console_port="{{cirros_console_port}}" - netblock_octets="192.168.242" - - - include: _util_instance_reboot.yml - uuid={{cirros_uuid}} - type=hard - - - 
include: _util_network_cirros_validate.yml - console_port="{{cirros_console_port}}" - netblock_octets="192.168.242" - - - include: _util_instance_halt.yml - uuid={{cirros_uuid}} - action_one=poweroff - action_two=poweron - - - include: _util_network_cirros_validate.yml - console_port="{{cirros_console_port}}" - netblock_octets="192.168.242" - - - include: _util_instance_halt.yml - uuid={{cirros_uuid}} - ip={{cirros_uuid}} - action_one=pause - action_two=unpause - - - include: _util_network_cirros_validate.yml - console_port="{{cirros_console_port}}" - netblock_octets="192.168.242" + - include: _util_instance_reboot.yml + uuid={{cirros_uuid}} + type=soft + + - include: _util_network_cirros_validate.yml + console_port="{{cirros_console_port}}" + uuid="{{cirros_uuid}}" + netblock_octets="192.168.242" + + - include: _util_instance_reboot.yml + uuid={{cirros_uuid}} + type=hard + + - include: _util_network_cirros_validate.yml + console_port="{{cirros_console_port}}" + uuid="{{cirros_uuid}}" + netblock_octets="192.168.242" + + - include: _util_instance_halt.yml + uuid={{cirros_uuid}} + action_one=poweroff + action_two=poweron + + - include: _util_network_cirros_validate.yml + console_port="{{cirros_console_port}}" + uuid="{{cirros_uuid}}" + netblock_octets="192.168.242" + + - include: _util_instance_halt.yml + uuid={{cirros_uuid}} + ip={{cirros_uuid}} + action_one=pause + action_two=unpause + + - include: _util_network_cirros_validate.yml + console_port="{{cirros_console_port}}" + uuid="{{cirros_uuid}}" + netblock_octets="192.168.242" diff --git a/ansible/deploy.yml b/ansible/deploy.yml index 6b1cccb..669be9c 100644 --- a/ansible/deploy.yml +++ b/ansible/deploy.yml @@ -4,9 +4,11 @@ connection: ssh vars: release: git + mode: deploy tasks: - include_tasks: tasks/distro-check.yml + when: mode == "deploy" - name: Load default vars include_vars: main.yml @@ -30,12 +32,15 @@ - name: Generate a random auth secret set_fact: auth_secret: "{{ lookup('password', '/dev/null length=30 
chars=ascii_letters') }}" + when: mode == "deploy" - include: terraform/{{cloud}}/terraform.yml + when: mode == "deploy" - name: Wait for instances to boot pause: minutes: "{{bootdelay}}" + when: mode == "deploy" - hosts: hypervisors any_errors_fatal: true @@ -43,6 +48,9 @@ become_method: sudo gather_facts: no connection: ssh + vars: + release: git + mode: deploy tasks: - include: terraform/{{cloud}}/postboot.yml @@ -53,6 +61,9 @@ become_method: sudo gather_facts: no connection: ssh + vars: + release: git + mode: deploy tasks: - name: Write syslog file @@ -62,12 +73,46 @@ owner: root group: sudo mode: u=r,g=r,o= + when: mode == "deploy" - name: Restart syslog service: name: rsyslog enabled: yes state: restarted + when: mode == "deploy" + + - name: Configure KSM to be sensible on boot + copy: + content: | + w /sys/kernel/mm/ksm/run - - - - 1 + w /sys/kernel/mm/ksm/pages_to_scan - - - - 1000000 + w /sys/kernel/mm/ksm/merge_across_nodes - - - - 0 + dest: /etc/tmpfiles.d/sf-ksm.conf + owner: root + mode: u=r,g=r,o=r + + # merge_across_nodes requires a reboot, so is skipped below + - name: Configure KSM to be sensible now + shell: | + echo "1" > /sys/kernel/mm/ksm/run + echo "100000" > /sys/kernel/mm/ksm/pages_to_scan + ignore_errors: True + + - name: Configure IPv6 to be disabled on boot + copy: + content: | + net.ipv6.conf.all.disable_ipv6 = 1 + net.ipv6.conf.default.disable_ipv6 = 1 + dest: /etc/sysctl.d/10-sf-ipv6.conf + owner: root + mode: u=r,g=r,o=r + + - name: Configure IPv6 to be disabled now + shell: | + sysctl -w net.ipv6.conf.all.disable_ipv6=1 + sysctl -w net.ipv6.conf.default.disable_ipv6=1 + ignore_errors: True - hosts: allsf any_errors_fatal: true @@ -75,11 +120,15 @@ become_method: sudo gather_facts: yes connection: ssh + vars: + release: git + mode: deploy tasks: - name: Syslog server is DB server set_fact: syslog: "{{hostvars['localhost']['database_node_ip']}}" + when: mode == "deploy" - name: Send syslog to the DB server, unless I am the DB server 
template: @@ -88,22 +137,24 @@ owner: root group: sudo mode: u=r,g=r,o= - when: hostvars['localhost']['database_node_ip'] != node_ip + when: hostvars['localhost']['database_node_ip'] != node_ip and mode == "deploy" - name: Restart syslog service: name: rsyslog enabled: yes state: restarted - when: hostvars['localhost']['database_node_ip'] != node_ip + when: hostvars['localhost']['database_node_ip'] != node_ip and mode == "deploy" - name: Load default vars include_vars: main.yml - include: includes/debian.yml + when: mode == "deploy" - name: Check that we can run KVM shell: kvm-ok + when: mode == "deploy" - name: Determine default interface shell: ip route list default | head -1 | cut -f 5 -d " " @@ -118,25 +169,29 @@ - name: Determine default interface MTU shell: ip link show {{node_egress_nic}} | grep mtu | sed -e 's/.*mtu //' -e 's/ .*//' register: node_mtu_complex + when: mode == "deploy" - name: Extract default interface MTU set_fact: node_mtu: "{{node_mtu_complex.stdout}}" + when: mode == "deploy" - name: Log node MTU debug: msg: "Node MTU is {{node_mtu}}" + when: mode == "deploy" - name: Abort if default interface MTU is too low fail: msg: "Node MTU is too low." 
- when: node_mtu|int < 2000 and cloud != "gcp" + when: node_mtu|int < 2000 and cloud != "gcp" and mode == "deploy" - name: Make /srv/shakenfist/ file: path: /srv/shakenfist state: directory mode: "0755" + when: mode == "deploy" # Install etcd on all nodes, with members of etcd-master being voting nodes - hosts: etcd @@ -146,6 +201,7 @@ etcd_cluster_name: shakenfist etcd_enable_v2: False etcd_master_group_name: etcd_master + when: mode == "deploy" - hosts: allsf any_errors_fatal: true @@ -155,14 +211,17 @@ connection: ssh vars: release: git + mode: deploy tasks: - include: includes/python3.yml + when: mode == "deploy" - name: Ensure the source directory is absent file: path: /srv/shakenfist/src/ state: absent + when: mode == "deploy" - name: Remove old wheels file: @@ -210,16 +269,18 @@ path: /etc/sf state: directory mode: "0755" + when: mode == "deploy" - name: Set system key from extra-vars set_fact: system_key: "{{admin_password}}" + when: mode == "deploy" - name: Use Hashicorp Vault for "system" namespace key (if enabled) block: - set_fact: system_key: "{{lookup('hashivault', '{{vault_system_key_path}}', 'key')}}" - when: vault_system_key_path is defined + when: vault_system_key_path is defined and mode == "deploy" rescue: - fail: @@ -232,6 +293,7 @@ owner: root group: sudo mode: u=r,g=r,o= + when: mode == "deploy" - name: Install sfrc for root user lineinfile: @@ -239,6 +301,7 @@ create: yes regexp: ". /etc/sf/sfrc" line: ". 
/etc/sf/sfrc" + when: mode == "deploy" - name: Write a global auth file template: @@ -247,6 +310,7 @@ owner: root group: sudo mode: u=r,g=r,o= + when: mode == "deploy" - hosts: db any_errors_fatal: true @@ -254,16 +318,21 @@ become_method: sudo gather_facts: no connection: ssh + vars: + release: git + mode: deploy tasks: - name: Install prometheus apt: name: prometheus state: latest + when: mode == "deploy" - name: Read local prometheus customizations set_fact: prom_additional: "{{lookup('file', 'terraform/{{cloud}}/prometheus.yml') }}" + when: mode == "deploy" - name: Write prometheus configuration file copy: @@ -306,19 +375,17 @@ dest: /etc/prometheus/prometheus.yml owner: root mode: u=rw,g=r,o=r + when: mode == "deploy" - name: Restart prometheus service: name: prometheus enabled: yes state: restarted + when: mode == "deploy" - include: includes/grafana.yml - - # - name: Remove previous grafana auth - # file: - # path: /var/lib/grafana/grafana.db - # state: absent + when: mode == "deploy" - name: Write grafana config template: @@ -326,6 +393,7 @@ dest: /etc/grafana/grafana.ini owner: root mode: u=rw,g=r,o=r + when: mode == "deploy" - name: Write grafana dashboard copy: @@ -333,6 +401,7 @@ dest: /etc/grafana/provisioning/dashboards/shakenfist.json owner: root mode: u=rw,g=r,o=r + when: mode == "deploy" - name: Write grafana dashboard config copy: @@ -340,6 +409,7 @@ dest: /etc/grafana/provisioning/dashboards/dashboards.yaml owner: root mode: u=rw,g=r,o=r + when: mode == "deploy" - name: Write prometheus grafana configuration file copy: @@ -360,15 +430,18 @@ dest: /etc/grafana/provisioning/datasources/prometheus.yml owner: root mode: u=rwx,g=r,o=r + when: mode == "deploy" - name: Restart grafana service: name: grafana-server enabled: yes state: restarted + when: mode == "deploy" - name: Create an admin namespace called "system" with one key configured shell: sf-passwd deploy "{{system_key}}" + when: mode == "deploy" - hosts: hypervisors any_errors_fatal: true @@ 
-376,6 +449,9 @@ become_method: sudo gather_facts: no connection: ssh + vars: + release: git + mode: deploy tasks: - name: Create storage directory @@ -383,6 +459,7 @@ path: /srv/shakenfist state: directory mode: "0755" + when: mode == "deploy" - name: Copy libvirt template copy: @@ -396,6 +473,7 @@ shell: | virsh net-destroy default ignore_errors: True + when: mode == "deploy" - name: Copy dhcp config template copy: @@ -423,23 +501,6 @@ node_ip: "{{node_ip_complex.stdout}}" when: node_ip is not defined - - name: Configure KSM to be sensible on boot - copy: - content: | - w /sys/kernel/mm/ksm/run - - - - 1 - w /sys/kernel/mm/ksm/pages_to_scan - - - - 1000000 - w /sys/kernel/mm/ksm/merge_across_nodes - - - - 0 - dest: /etc/tmpfiles.d/sf-ksm.conf - owner: root - mode: u=r,g=r,o=r - - # merge_across_nodes requires a reboot, so is skipped below - - name: Configure KSM to be sensible now - shell: | - echo "1" > /sys/kernel/mm/ksm/run - echo "100000" > /sys/kernel/mm/ksm/pages_to_scan - ignore_errors: True - - name: Write systemd unit template: src: files/sf.service @@ -459,8 +520,10 @@ roles: - role: andrewrothstein.terraform terraform_binary_dir: /usr/local/bin/terraform_install + when: mode == "deploy" - role: andrewrothstein.go + when: mode == "deploy" - hosts: db any_errors_fatal: true @@ -468,24 +531,31 @@ become_method: sudo gather_facts: no connection: ssh + vars: + release: git + mode: deploy tasks: - include: includes/ansible.yml + when: mode == "deploy" - name: Clear out old terraform providers file: path: /srv/shakenfist/terraform-provider-shakenfist state: absent + when: mode == "deploy" - name: Clone the terraform provider git: repo: https://github.com/shakenfist/terraform-provider-shakenfist dest: /srv/shakenfist/terraform-provider-shakenfist + when: mode == "deploy" - name: Build terraform provider shell: cmd: /usr/local/go/bin/go build chdir: /srv/shakenfist/terraform-provider-shakenfist + when: mode == "deploy" - name: Install terraform provider copy: 
@@ -493,6 +563,7 @@ dest: /usr/local/bin/terraform_install/terraform-provider-shakenfist remote_src: yes mode: u+rx,g+rx,o+rx + when: mode == "deploy" - name: Run any upgrade steps shell: sf-upgrade From 38b736a5a0f28dd84d8c86dea0badec444f0c05e Mon Sep 17 00:00:00 2001 From: Michael Still Date: Tue, 1 Sep 2020 15:59:59 +1000 Subject: [PATCH 3/3] Wait for deletes a little longer. --- ansible-ci/tests/_setup.yml | 24 +++++++++++++++++++++++ ansible-ci/tests/_util_instance_start.yml | 5 +++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/ansible-ci/tests/_setup.yml b/ansible-ci/tests/_setup.yml index 677ec8c..8416fd4 100644 --- a/ansible-ci/tests/_setup.yml +++ b/ansible-ci/tests/_setup.yml @@ -5,11 +5,35 @@ sf-client --simple instance delete $inst done + attempts=0 + while [ `sf-client --simple instance list | grep -v uuid | wc -l` -gt 0 ] + do + attempts=$(( $attempts + 1)) + sleep 10 + if [ $attempts -gt 10 ] + then + echo "Failing after instance deletes did not complete" + exit 1 + fi + done + for net in `sf-client --simple network list | grep -v uuid | cut -f 1 -d ","` do sf-client --simple network delete $net done + attempts=0 + while [ `sf-client --simple network list | grep -v uuid | wc -l` -gt 0 ] + do + attempts=$(( $attempts + 1)) + sleep 10 + if [ $attempts -gt 10 ] + then + echo "Failing after network deletes did not complete" + exit 1 + fi + done + for namespace in `sf-client --simple namespace list | grep -v uuid | grep -v system` do sf-client --simple namespace delete $namespace diff --git a/ansible-ci/tests/_util_instance_start.yml b/ansible-ci/tests/_util_instance_start.yml index f8ab6c0..db65476 100644 --- a/ansible-ci/tests/_util_instance_start.yml +++ b/ansible-ci/tests/_util_instance_start.yml @@ -23,8 +23,9 @@ debug: msg: "{{instance_create_out}}" -- name: Pause very briefly - shell: sleep 10 +- name: Wait for instance to boot + pause: + minutes: 2 - name: Count the number of VMs after virt: