diff --git a/ansible/playbooks/telegraf.yaml b/ansible/playbooks/telegraf.yaml new file mode 100644 index 00000000000..a490821a23f --- /dev/null +++ b/ansible/playbooks/telegraf.yaml @@ -0,0 +1,4 @@ +- name: Set up Telegraf and InfluxDB v2 + hosts: localhost + roles: + - autoware.dev_env.telegraf diff --git a/ansible/roles/telegraf/README.md b/ansible/roles/telegraf/README.md new file mode 100644 index 00000000000..5355384e72a --- /dev/null +++ b/ansible/roles/telegraf/README.md @@ -0,0 +1,27 @@ +# telegraf + +This role installs [InfluxDB v2](https://docs.influxdata.com/influxdb/v2/get-started/) and [Telegraf](https://docs.influxdata.com/telegraf/v1/) to collect system metrics. + +After installing Telegraf and InfluxDB v2, you need to manually create an API token and copy it into `/etc/telegraf/telegraf.conf`. + +## Create Organization + +When accessing the InfluxDB UI for the first time, you need to create an organization and a bucket. Enter the values as follows. + +![](./files/create-org.png) + +## Generate API token + +Next, move to the `Load Data` tab, click the `GENERATE API TOKEN` button, and generate the API token. + + + +![](./files/load-data.png) + +![](./files/generate-api-token.png) + +## Copy API token and restart Telegraf + +Paste the obtained API token into the empty string part of `token = ""` in `/etc/telegraf/telegraf.conf`. Then restart Telegraf with `sudo systemctl restart telegraf` so that it picks up the new token. 
+
+
diff --git a/ansible/roles/telegraf/defaults/main.yaml b/ansible/roles/telegraf/defaults/main.yaml
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/ansible/roles/telegraf/files/create-org.png b/ansible/roles/telegraf/files/create-org.png
new file mode 100644
index 00000000000..a286a54883c
Binary files /dev/null and b/ansible/roles/telegraf/files/create-org.png differ
diff --git a/ansible/roles/telegraf/files/generate-api-token.png b/ansible/roles/telegraf/files/generate-api-token.png
new file mode 100644
index 00000000000..4b2bb26596d
Binary files /dev/null and b/ansible/roles/telegraf/files/generate-api-token.png differ
diff --git a/ansible/roles/telegraf/files/get_telegraf_proccpu_json.sh b/ansible/roles/telegraf/files/get_telegraf_proccpu_json.sh
new file mode 100755
index 00000000000..bd70eef20c7
--- /dev/null
+++ b/ansible/roles/telegraf/files/get_telegraf_proccpu_json.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+#
+# Emits one JSON object for Telegraf's exec input: every key is a sanitized
+# process command line, every value the CPU figure pidstat reported for it.
+# Rows whose integer part is 0 or less are skipped.
+
+# Length of the single pidstat sampling window, in seconds.
+SAMPLING_SEC=5
+
+echo "{"
+# pidstat and sort run in the C locale so numbers use "." as the decimal
+# separator, which the integer test below relies on.
+# tail drops the first three lines of pidstat output (presumably banner, blank
+# line and column header). awk keeps field 8 as the CPU value (assumed to be
+# %CPU for "pidstat -u -h" -- verify against the installed sysstat) and blanks
+# fields 1-9 so that only the command line is left.
+LC_ALL=C pidstat -u -h -l "${SAMPLING_SEC}" 1 |
+    tail -n +4 |
+    awk '{ cpu=$8; $1=$2=$3=$4=$5=$6=$7=$8=$9=""; print cpu,$0 }' |
+    LC_ALL=C sort -n |
+    while read -r cpu cmd; do
+        if [[ ${cpu%%.*} -le 0 ]]; then
+            continue
+        fi
+        cmd="${cmd// /_}"
+        cmd="${cmd//=/_}"
+        # Double quotes, backslashes and control characters would break the
+        # JSON string, so they are replaced as well.
+        cmd="${cmd//\"/_}"
+        cmd="${cmd//\\/_}"
+        cmd="${cmd//[[:cntrl:]]/_}"
+        cmd="${cmd:0:50}"
+        echo "\"${cmd}\":${cpu},"
+    done
+# Dummy last member: every entry above ends with a comma, so this keeps the
+# object valid JSON.
+echo '"z":0'
+echo "}"
diff --git a/ansible/roles/telegraf/files/get_telegraf_procmem_json.sh b/ansible/roles/telegraf/files/get_telegraf_procmem_json.sh
new file mode 100755
index 00000000000..dd513aa3781
--- /dev/null
+++ b/ansible/roles/telegraf/files/get_telegraf_procmem_json.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+#
+# Emits one JSON object for Telegraf's exec input: every key is a sanitized
+# process command line, every value the RSS figure ps reported for it.
+# Processes below the 30000 threshold are skipped (ps reports RSS in KiB
+# according to its manual -- confirm for the target platform).
+
+echo "{"
+ps -ax --format "rss command" |
+    while read -r rss cmd; do
+        # Skip the header row ps prints and anything else non-numeric
+        # instead of letting the arithmetic test guess at it.
+        if [[ ! $rss =~ ^[0-9]+$ ]]; then
+            continue
+        fi
+        if [[ $rss -lt 30000 ]]; then
+            continue
+        fi
+        cmd="${cmd// /_}"
+        cmd="${cmd//=/_}"
+        # Double quotes, backslashes and control characters would break the
+        # JSON string, so they are replaced as well.
+        cmd="${cmd//\"/_}"
+        cmd="${cmd//\\/_}"
+        cmd="${cmd//[[:cntrl:]]/_}"
+        cmd="${cmd:0:50}"
+        echo "\"${cmd}\":${rss},"
+    done
+# Dummy last member: every entry above ends with a comma, so this keeps the
+# object valid JSON.
+echo '"z":0'
+echo "}"
diff --git a/ansible/roles/telegraf/files/load-data.png b/ansible/roles/telegraf/files/load-data.png
new file mode 100644
index 00000000000..7094a0378ce
Binary files
/dev/null and b/ansible/roles/telegraf/files/load-data.png differ
diff --git a/ansible/roles/telegraf/files/telegraf.conf b/ansible/roles/telegraf/files/telegraf.conf
new file mode 100644
index 00000000000..9593f3f10c1
--- /dev/null
+++ b/ansible/roles/telegraf/files/telegraf.conf
@@ -0,0 +1,66 @@
+# Telegraf agent configuration: collects host metrics and writes them to the
+# InfluxDB v2 instance listening on 127.0.0.1:8086.
+
+[global_tags]
+
+[agent]
+  interval = "10s"
+  round_interval = true
+  metric_batch_size = 1000
+  metric_buffer_limit = 10000
+  collection_jitter = "0s"
+  flush_interval = "10s"
+  flush_jitter = "0s"
+  precision = "0s"
+  hostname = ""
+  omit_hostname = false
+
+[[outputs.influxdb_v2]]
+  urls = ["http://127.0.0.1:8086"]
+  # Attach the access token. See also https://docs.influxdata.com/influxdb/cloud/admin/tokens/create-token/#manage-tokens-in-the-influxdb-ui
+  token = ""
+  organization = "autowarefoundation"
+  bucket = "autoware"
+  timeout = "5s"
+  user_agent = "telegraf"
+
+[[inputs.cpu]]
+  percpu = true
+  totalcpu = true
+  collect_cpu_time = false
+  report_active = false
+  core_tags = false
+
+[[inputs.disk]]
+  ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"]
+
+[[inputs.diskio]]
+
+[[inputs.kernel]]
+
+[[inputs.mem]]
+
+[[inputs.processes]]
+
+[[inputs.swap]]
+
+[[inputs.system]]
+
+# Per-process CPU usage as JSON from the helper script. The timeout has to
+# stay above the script's 5 s pidstat sampling window.
+[[inputs.exec]]
+  commands = ["/opt/autoware/bin/get_telegraf_proccpu_json.sh"]
+  timeout = "10s"
+  data_format = "json"
+  name_suffix = "_proccpu"
+
+# Per-process memory (RSS) as JSON from the helper script.
+[[inputs.exec]]
+  commands = ["/opt/autoware/bin/get_telegraf_procmem_json.sh"]
+  timeout = "5s"
+  data_format = "json"
+  name_suffix = "_procmem"
+
+[[inputs.net]]
+
+[[inputs.nvidia_smi]]
diff --git a/ansible/roles/telegraf/handlers/main.yaml b/ansible/roles/telegraf/handlers/main.yaml
new file mode 100644
index 00000000000..854c7d99027
--- /dev/null
+++ b/ansible/roles/telegraf/handlers/main.yaml
@@ -0,0 +1,6 @@
+# Notified when telegraf.conf changes; restarting a system service needs root.
+- name: Restart telegraf
+  become: true
+  ansible.builtin.systemd:
+    name: telegraf
+    state: restarted
diff --git a/ansible/roles/telegraf/meta/main.yaml b/ansible/roles/telegraf/meta/main.yaml
new file mode
100644
index 00000000000..e69de29bb2d
diff --git a/ansible/roles/telegraf/tasks/main.yaml b/ansible/roles/telegraf/tasks/main.yaml
new file mode 100644
index 00000000000..e6cc9a3170e
--- /dev/null
+++ b/ansible/roles/telegraf/tasks/main.yaml
@@ -0,0 +1,129 @@
+# Installs InfluxDB v2, its CLI and Telegraf from the InfluxData apt repository,
+# deploys the helper scripts and telegraf.conf, and runs the initial InfluxDB
+# setup once.
+
+- name: Download influxdata-archive_compat.key
+  become: true
+  ansible.builtin.get_url:
+    url: https://repos.influxdata.com/influxdata-archive_compat.key
+    dest: /tmp/influxdata-archive_compat.key
+    # A key file is data, so it does not need the executable bit.
+    mode: "0644"
+    # get_url verifies the digest and fails the task on a mismatch.
+    checksum: sha256:393e8779c89ac8d958f81f942f9ad7fb82a25e133faddaf92e15b16e6ac9ce4c
+
+- name: Convert the key to gpg format in trusted.gpg.d
+  become: true
+  ansible.builtin.command:
+    cmd: gpg --dearmor -o /etc/apt/trusted.gpg.d/influxdata-archive_compat.gpg /tmp/influxdata-archive_compat.key
+    # The command is skipped once the keyring exists, so reruns change nothing.
+    creates: /etc/apt/trusted.gpg.d/influxdata-archive_compat.gpg
+
+- name: Add InfluxData repository
+  become: true
+  ansible.builtin.apt_repository:
+    repo: deb [signed-by=/etc/apt/trusted.gpg.d/influxdata-archive_compat.gpg] https://repos.influxdata.com/debian stable main
+    state: present
+    # apt_repository appends ".list" itself, so the extension is left out here.
+    filename: influxdata
+
+- name: Update apt cache
+  become: true
+  ansible.builtin.apt:
+    update_cache: true
+    cache_valid_time: 3600
+
+- name: Install InfluxDB v2, InfluxDB v2 CLI, Telegraf and sysstat
+  become: true
+  ansible.builtin.apt:
+    name:
+      - influxdb2
+      - influxdb2-cli
+      - telegraf
+      # Provides pidstat, which get_telegraf_proccpu_json.sh runs.
+      - sysstat
+    state: present
+
+- name: Create telegraf_binary directory
+  become: true
+  ansible.builtin.file:
+    path: /opt/autoware/bin
+    state: directory
+    mode: "0755"
+
+- name: Copy the Telegraf helper scripts to /opt/autoware/bin
+  become: true
+  ansible.builtin.copy:
+    src: "{{ role_path }}/files/{{ item }}"
+    dest: "/opt/autoware/bin/{{ item }}"
+    owner: root
+    group: root
+    mode: "0755"
+  loop:
+    - get_telegraf_proccpu_json.sh
+    - get_telegraf_procmem_json.sh
+
+- name: Copy telegraf.conf to /etc/telegraf
+  become: true
+  ansible.builtin.copy:
+    src: "{{ role_path }}/files/telegraf.conf"
+    dest: /etc/telegraf/telegraf.conf
+    owner: root
+    group: root
+    mode: "0644"
+    # Keeps a copy of the previous file before it is overwritten.
+    backup: true
+  notify:
+    - Restart telegraf
+
+- name: Ensure telegraf is running and enabled
+  become: true
+  ansible.builtin.systemd:
+    name: telegraf
+    enabled: true
+    state: started
+
+- name: Ensure InfluxDB service is running
+  become: true
+  ansible.builtin.service:
+    name: influxdb
+    state: started
+    enabled: true
+
+# GET /api/v2/setup answers {"allowed": true} only while the instance has not
+# been onboarded yet. The retries cover the time the HTTP API needs to come up
+# after the service start.
+- name: Check if InfluxDB is already set up
+  ansible.builtin.uri:
+    url: http://127.0.0.1:8086/api/v2/setup
+  register: influx_setup_status
+  until: influx_setup_status.status | default(0) == 200
+  retries: 10
+  delay: 3
+
+# The credentials default to the values that were previously hard-coded here;
+# override the telegraf_influxdb_* variables to change them.
+- name: Set up influxDB v2
+  become: true
+  ansible.builtin.command:
+    argv:
+      - influx
+      - setup
+      - "-f"
+      - "--name"
+      - default
+      - "--username"
+      - "{{ telegraf_influxdb_username | default('autoware') }}"
+      - "--password"
+      - "{{ telegraf_influxdb_password | default('autoware') }}"
+      - "--token"
+      - "{{ telegraf_influxdb_token | default('autowaretoken') }}"
+      - "--org"
+      - autowarefoundation
+      - "--bucket"
+      - autoware
+      - "--retention"
+      - "0"
+  when: influx_setup_status.json.allowed | bool
+  # Only reached while setup is still allowed, so it always changes state.
+  changed_when: true