Skip to content

Commit

Permalink
[PLAT-9857] Install otel collector during universe creation, configur…
Browse files Browse the repository at this point in the history
…e audit logging and basic log export config upload.

Summary:
This diff establishes the base logic for deploying the OpenTelemetry collector on the DB nodes.
It also applies the audit logging config via gflags and the log export config via the otel collector config file.

Test Plan:
Create universe. Otel collector is not installed by default.
Enable yb.universe.otel_collector_enabled key.
Create centos7 universe. Otel collector is installed and configured as system scoped systemd service, but is turned off.
Create alma8 universe. Otel collector is installed and configured as user scoped systemd service, but is turned off.
Configure DataDog telemetry provider via API. Create universe via API with audit logging enabled + log export enabled. Otel collector is installed and running. Logs are sent to DataDog.

Reviewers: svarshney, muthu, #yba-api-review!

Reviewed By: svarshney, muthu

Subscribers: yugaware

Differential Revision: https://phorge.dev.yugabyte.com/D28902
  • Loading branch information
anmalysh-yb committed Oct 16, 2023
1 parent e667f96 commit 18291ef
Show file tree
Hide file tree
Showing 36 changed files with 2,156 additions and 54 deletions.
1 change: 1 addition & 0 deletions .arclint
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"(^managed/src/main/resources/metric/Dashboard[.]json$)",
"(^managed/src/main/resources/metric/recording_rules[.]yml$)",
"(^managed/devops/replicated[.]yml$)",
"(^managed/devops/roles/.*$)",
"(^python/yb/py[.]typed$)",
"(^managed/RUNTIME-FLAGS[.]md$)",
"(^[.]clang-tidy)",
Expand Down
7 changes: 7 additions & 0 deletions managed/devops/opscli/ybops/cloud/common/method.py
Original file line number Diff line number Diff line change
Expand Up @@ -773,6 +773,9 @@ def add_extra_args(self):
help="Comma-separated LUN indexes for mounted on instance disks.")
self.parser.add_argument("--install_locales", action="store_true", default=False,
help="If enabled YBA will install locale on the DB nodes")
self.parser.add_argument("--install_otel_collector", action="store_true")
self.parser.add_argument('--otel_col_config_file', default=None,
help="Path to OpenTelemetry Collector config file.")

def callback(self, args):
host_info = self.cloud.get_host_info(args)
Expand Down Expand Up @@ -831,6 +834,10 @@ def callback(self, args):
self.extra_vars.update({"configure_ybc": args.configure_ybc})
self.extra_vars["device_names"] = self.cloud.get_device_names(args)
self.extra_vars["lun_indexes"] = args.lun_indexes
if args.install_otel_collector:
self.extra_vars.update({"install_otel_collector": args.install_otel_collector})
if args.otel_col_config_file:
self.extra_vars.update({"otel_col_config_file_local": args.otel_col_config_file})

if wait_for_server(self.extra_vars):
self.cloud.setup_ansible(args).run("yb-server-provision.yml",
Expand Down
16 changes: 16 additions & 0 deletions managed/devops/roles/install_otel_collector/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Copyright 2023 YugaByte, Inc. and Contributors
#
# Licensed under the Polyform Free Trial License 1.0.0 (the "License"); you
# may not use this file except in compliance with the License. You
# may obtain a copy of the License at
#
# https://github.com/YugaByte/yugabyte-db/blob/master/licenses/POLYFORM-FREE-TRIAL-LICENSE-1.0.0.txt

# Default variables for the install_otel_collector role.

# Home directory of the service user; every collector path below is rooted here.
yb_home_dir: "/home/{{ user_name }}"
# Directory the collector files are installed into.
otel_col_dir: "{{ yb_home_dir }}/otel-collector"
# Location of the collector config file.
otel_col_config_file: "{{ yb_home_dir }}/otel-collector/config.yml"
# Paths for collector logs and persistent queues under the install directory.
otel_col_logs_dir: "{{ yb_home_dir }}/otel-collector/logs"
otel_col_psq_dir: "{{ yb_home_dir }}/otel-collector/psq"
# Collector release, platform and CPU architecture; presumably combined into the
# otelcol-contrib package file name by the install tasks - confirm.
otel_col_version: "0.84.0"
otel_col_platform: "linux"
# 'arm64' when Ansible reports an aarch64 host, 'amd64' for everything else.
otel_col_arch: "{{ 'arm64' if ansible_architecture == 'aarch64' else 'amd64'}}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Installs the OpenTelemetry collector (otelcol-contrib) for the service user:
# unpacks the package into a temp directory, copies it into the install
# directory, and links the logs / persistent-queue directories to the first
# mount point.

- name: Install OpenTelemetry collector | Set paths part 1
  set_fact:
    otel_col_temp_path: "{{ remote_tmp_dir | default('/tmp') }}/otel-collector"
    otel_col_package_file: "otelcol-contrib_{{ otel_col_version }}_{{ otel_col_platform }}_{{ otel_col_arch }}.tar.gz"
    _mount_points: "{{ mount_points.split(',') }}"

# Kept as a separate task because these facts reference the ones set in part 1.
- name: Install OpenTelemetry collector | Set paths part 2
  set_fact:
    otel_col_local_path: "{{ local_package_path }}/{{ otel_col_package_file }}"
    _mount_logs_dir: "{{ _mount_points[0] }}/otel-collector/logs"
    _mount_psq_dir: "{{ _mount_points[0] }}/otel-collector/psq"

# Start from an empty temp directory so leftovers of an earlier run are not installed.
- name: Install OpenTelemetry collector | Clean up OpenTelemetry collector temp directory
  file:
    path: "{{ otel_col_temp_path }}"
    state: absent

- name: Install OpenTelemetry collector | Create OpenTelemetry collector temp directory
  file:
    path: "{{ otel_col_temp_path }}"
    state: directory
    mode: 0755

# The archive is taken from the controller (remote_src: no). This replaces the
# deprecated "copy: yes" option, which has the same meaning.
- name: Install OpenTelemetry collector | Download and uncompress OpenTelemetry collector
  unarchive:
    src: "{{ otel_col_local_path }}"
    dest: "{{ otel_col_temp_path }}"
    remote_src: no

- name: Install OpenTelemetry collector | Create OpenTelemetry collector directory
  file:
    path: "{{ otel_col_dir }}"
    state: directory
    mode: 0755

# The trailing slash on src copies the directory contents, not the directory itself.
- name: Install OpenTelemetry collector | Copy unpacked collector files
  copy:
    src: "{{ otel_col_temp_path }}/"
    dest: "{{ otel_col_dir }}"
    remote_src: True

# The file module is used instead of a shell chmod so the task is idempotent
# and only reports a change when the mode actually differs.
- name: Install OpenTelemetry collector | Ensure otelcol-contrib permissions.
  file:
    path: "{{ otel_col_dir }}/otelcol-contrib"
    mode: 0755

- name: Install OpenTelemetry collector | Clean up OpenTelemetry collector temp directory
  file:
    path: "{{ otel_col_temp_path }}"
    state: absent

# Logs live on the first mount point; the path under the install directory is a symlink to it.
- name: Install OpenTelemetry collector | Create OpenTelemetry collector logs directory
  file:
    path: "{{ _mount_logs_dir }}"
    state: directory
    mode: 0755

- name: Install OpenTelemetry collector | Symlink OpenTelemetry collector logs directory
  file:
    src: "{{ _mount_logs_dir }}"
    dest: "{{ otel_col_logs_dir }}"
    mode: 0755
    state: link
    force: yes

# Persistent queues follow the same layout as the logs directory.
- name: Install OpenTelemetry collector | Create OpenTelemetry collector persistent queues directory
  file:
    path: "{{ _mount_psq_dir }}"
    state: directory
    mode: 0755

- name: Install OpenTelemetry collector | Symlink OpenTelemetry collector persistent queues directory
  file:
    src: "{{ _mount_psq_dir }}"
    dest: "{{ otel_col_psq_dir }}"
    mode: 0755
    state: link
    force: yes
19 changes: 19 additions & 0 deletions managed/devops/roles/install_otel_collector/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Entry point of the install_otel_collector role.

# Fail fast unless the two variables checked below were supplied by the caller.
- assert:
that:
- user_name is defined
- mount_points is defined

# Install the collector files, running as the service user.
- name: Install OpenTelemetry collector
include: install-otel-col.yml
become: yes
become_method: sudo
become_user: "{{ user_name }}"

# NOTE(review): unlike the other two includes, this one sets no become_user -
# presumably so the included tasks run with the play's own privileges; confirm.
- name: Create OpenTelemetry collector service
include: otel-col-service.yml

# Stop, reconfigure and (when a config was supplied) start the service as the service user.
- name: Control OpenTelemetry collector service
include: otel-col-control.yml
become: yes
become_method: sudo
become_user: "{{ user_name }}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Stops the otel-collector service, replaces its config file and, only when a
# new config file was supplied (otel_col_config_file_local), enables and starts
# it again. Without a supplied config the service is left stopped and disabled.
- block:
  # A unit file under /etc/systemd/system means the service is system-scoped;
  # otherwise it is handled as a user-scoped service.
  - name: Control OpenTelemetry collector | Determine system or user scope
    stat:
      path: /etc/systemd/system/otel-collector.service
    register: systemd_system

  - name: Control OpenTelemetry collector | User mode systemd | Stop existing running OpenTelemetry Collector
    systemd:
      name: otel-collector
      state: stopped
      enabled: no
      scope: user
    when: not systemd_system.stat.exists

  # System-scoped units are controlled through "sudo systemctl" shell commands
  # rather than the systemd module.
  - name: Control OpenTelemetry collector | System mode systemd | Stop and disable OpenTelemetry collector
    block:
    - name: Control OpenTelemetry collector | System mode systemd | Stop existing running OpenTelemetry collector
      shell:
        cmd: "sudo systemctl stop otel-collector"

    - name: Control OpenTelemetry collector | System mode systemd | Disable OpenTelemetry collector
      shell:
        cmd: "sudo systemctl disable otel-collector"
    when: systemd_system.stat.exists

  # The old config is always removed, even when no new one is supplied.
  - name: Control OpenTelemetry collector | Remove existing OpenTelemetry collector conf file
    file:
      path: "{{ otel_col_config_file }}"
      state: absent

  - name: Control OpenTelemetry collector | Download new OpenTelemetry collector conf file
    copy:
      src: "{{ otel_col_config_file_local }}"
      dest: "{{ otel_col_config_file }}"
    when: otel_col_config_file_local is defined

  - name: Control OpenTelemetry collector | User mode systemd | Enable and start OpenTelemetry collector
    systemd:
      name: otel-collector
      state: started
      enabled: yes
      scope: user
    when: not systemd_system.stat.exists and otel_col_config_file_local is defined

  - name: Control OpenTelemetry collector | System mode systemd | Enable and start OpenTelemetry collector
    block:
    - name: Control OpenTelemetry collector | System mode systemd | Enable OpenTelemetry collector service
      shell:
        cmd: "sudo systemctl enable otel-collector"

    - name: Control OpenTelemetry collector | System mode systemd | Run otel-collector.service to start the OpenTelemetry collector
      shell:
        cmd: "sudo systemctl start otel-collector"
    when: systemd_system.stat.exists and otel_col_config_file_local is defined
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Installs the otel-collector systemd unit file, either system-scoped
# (/etc/systemd/system) or user-scoped (~/.config/systemd/user), and reloads
# the matching systemd daemon.
- block:
  # Default to a system-scoped unit; overridden just below for user scope.
  - set_fact:
      systemd_dir: "/etc/systemd/system"

  # Everything except RedHat-family 7 and Amazon Linux 2 gets a user-scoped unit.
  - block:
    - set_fact:
        systemd_dir: "{{ yb_home_dir }}/.config/systemd/user"

    - name: Add user systemd directory
      file:
        path: "{{ systemd_dir }}"
        state: directory
        owner: "{{ user_name }}"
        group: "{{ user_name }}"
        mode: 0744
    when: ansible_os_family != 'RedHat' or (ansible_distribution_major_version != '7' and not (ansible_distribution == 'Amazon' and ansible_distribution_major_version == '2'))

  # Give the service user passwordless sudo for the systemctl commands the role
  # runs against otel-collector. The rule is written for user_name (not a
  # hard-coded account) because that is the user those commands run as.
  - name: OpenTelemetry Collector Service | Add yugabyte sudo user permissions for otel service controls without password
    lineinfile:
      dest: /etc/sudoers.d/yugabyte-otel-col
      line: '{{ user_name }} ALL=(ALL:ALL) NOPASSWD: /bin/systemctl start otel-collector,
        /bin/systemctl stop otel-collector,
        /bin/systemctl restart otel-collector,
        /bin/systemctl enable otel-collector,
        /bin/systemctl disable otel-collector'
      create: yes
      state: present
      mode: 0440
      validate: 'visudo -cf %s'

  # Adding systemd service file for otel-collector
  - name: OpenTelemetry Collector Service | Add otel-collector.service
    template:
      src: otel-collector.service
      dest: "{{ systemd_dir }}/otel-collector.service"
      owner: "{{ user_name }}"
      group: "{{ user_name }}"
      mode: 0644

  # Exactly one of the two reload tasks below runs, depending on the chosen scope.
  - name: OpenTelemetry Collector Service | System mode systemd | Perform daemon-reload for the new services
    shell:
      cmd: "sudo systemctl daemon-reload"
    when: systemd_dir == "/etc/systemd/system"

  - name: OpenTelemetry Collector Service | User mode systemd | Reload systemd user daemon
    systemd:
      daemon_reload: yes
      scope: user
    when: systemd_dir != "/etc/systemd/system"
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# systemd unit template for the OpenTelemetry collector (otelcol-contrib).
# Rendered by Ansible; the binary and config paths are rooted at yb_home_dir.
[Unit]
Description=OpenTelemetry Collector
Requires=network-online.target
After=network.target network-online.target multi-user.target
StartLimitInterval=100
StartLimitBurst=10

# NOTE(review): [Path] is normally a section of .path units - confirm that
# systemd honours these PathExists= entries inside a .service unit.
[Path]
PathExists={{yb_home_dir}}/otel-collector/otelcol-contrib
PathExists={{yb_home_dir}}/otel-collector/config.yml

[Service]
# User= and Group= are rendered only on RedHat-family 7 and Amazon Linux 2 hosts.
# Presumably those hosts run this as a system-scoped unit, while all others
# run it from the service user's own systemd instance - confirm.
{% if ansible_os_family == 'RedHat' and (ansible_distribution_major_version == '7' or (ansible_distribution == 'Amazon' and ansible_distribution_major_version == '2')) %}
User={{ user_name }}
Group={{ user_name }}
{% endif %}
# Start: run the collector binary against the config file in the install directory
ExecStart={{yb_home_dir}}/otel-collector/otelcol-contrib \
--config=file:{{yb_home_dir}}/otel-collector/config.yml
Restart=always
RestartSec=5
# Stop -> SIGTERM - 10s - SIGKILL (if not stopped)
KillMode=process
TimeoutStopFailureMode=terminate
KillSignal=SIGTERM
TimeoutStopSec=10
FinalKillSignal=SIGKILL
# Logs: stdout and stderr are both directed to syslog
StandardOutput=syslog
StandardError=syslog
# ulimit: resource limits for the collector process
LimitCORE=infinity
LimitNOFILE=1048576
LimitNPROC=12000

[Install]
WantedBy=default.target
3 changes: 3 additions & 0 deletions managed/devops/roles/provision-cluster-server/meta/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ dependencies:
- node_exporter
tags: yb-prebuilt-ami

- role: install_otel_collector
when: install_otel_collector is defined and install_otel_collector|bool

- role: install_backup_util
when: ansible_architecture != "aarch64"
util:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,7 @@ public void checkSingleUniverse(CheckSingleUniverseParams params) {
new NodeInfo()
.setNodeHost(nodeDetails.cloudInfo.private_ip)
.setNodeName(nodeDetails.nodeName)
.setNodeUuid(nodeDetails.nodeUuid)
.setNodeIdentifier(
nodeInstance != null ? nodeInstance.getDetails().instanceName : "")
.setYbSoftwareVersion(userIntent.ybSoftwareVersion)
Expand Down Expand Up @@ -975,8 +976,8 @@ private String generateCollectMetricsScript(UUID universeUuid, NodeInfo nodeInfo
// they are added.
Path path =
fileHelperService.createTempFile(
"collect_metrics_" + universeUuid + "_" + nodeInfo.nodeName, ".sh");
Files.write(path, scriptContent.getBytes(StandardCharsets.UTF_8));
"collect_metrics_" + universeUuid + "_" + nodeInfo.nodeUuid, ".sh");
Files.writeString(path, scriptContent);

return path.toString();
} catch (IOException e) {
Expand All @@ -997,8 +998,8 @@ private String generateNodeCheckScript(UUID universeUuid, NodeInfo nodeInfo) {
// they are added.
Path path =
fileHelperService.createTempFile(
"node_health_" + universeUuid + "_" + nodeInfo.nodeName, ".py");
Files.write(path, scriptContent.getBytes(StandardCharsets.UTF_8));
"node_health_" + universeUuid + "_" + nodeInfo.nodeUuid, ".py");
Files.writeString(path, scriptContent);

return path.toString();
} catch (IOException e) {
Expand Down Expand Up @@ -1054,6 +1055,7 @@ public static class NodeInfo {
private String ybcDir = "";
private String nodeHost;
private String nodeName;
private UUID nodeUuid;
private String nodeIdentifier = "";
private String ybSoftwareVersion = null;
private boolean enableTls = false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1047,6 +1047,8 @@ protected void fillSetupParamsForNode(
// Whether to install node_exporter on nodes or not.
params.extraDependencies.installNodeExporter =
taskParams().extraDependencies.installNodeExporter;
// Whether to install OpenTelemetry Collector on nodes or not.
params.otelCollectorEnabled = taskParams().otelCollectorEnabled;
// Which user the node exporter service will run as
params.nodeExporterUser = taskParams().nodeExporterUser;
// Development testing variable.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// Copyright (c) YugaByte, Inc.

package com.yugabyte.yw.commissioner.tasks.upgrade;

import com.yugabyte.yw.commissioner.BaseTaskDependencies;
import com.yugabyte.yw.commissioner.UpgradeTaskBase;
import com.yugabyte.yw.commissioner.UserTaskDetails.SubTaskGroupType;
import com.yugabyte.yw.forms.AuditLogConfigParams;
import com.yugabyte.yw.models.helpers.NodeDetails.NodeState;
import javax.inject.Inject;
import lombok.EqualsAndHashCode;
import lombok.extern.slf4j.Slf4j;

/**
 * Upgrade task for changing the audit logging configuration of a universe.
 *
 * <p>This is currently a placeholder: {@link #run()} does nothing.
 *
 * <p>TODO: implement the task so that it updates the gflag, updates the otel config and restarts
 * the otel collector.
 */
@Slf4j
@EqualsAndHashCode(callSuper = false)
public class ModifyAuditLoggingConfig extends UpgradeTaskBase {

  @Inject
  protected ModifyAuditLoggingConfig(BaseTaskDependencies baseTaskDependencies) {
    super(baseTaskDependencies);
  }

  /** Intentionally empty until the task is implemented (see the class-level TODO). */
  @Override
  public void run() {}

  /** Returns the node state reported by this task: {@code Reprovisioning}. */
  @Override
  public NodeState getNodeState() {
    return NodeState.Reprovisioning;
  }

  /** Returns the sub-task group type reported by this task: {@code Provisioning}. */
  @Override
  public SubTaskGroupType getTaskSubGroupType() {
    return SubTaskGroupType.Provisioning;
  }

  /** Returns the task parameters cast to {@link AuditLogConfigParams}. */
  @Override
  protected AuditLogConfigParams taskParams() {
    final AuditLogConfigParams auditLogParams = (AuditLogConfigParams) taskParams;
    return auditLogParams;
  }
}
Loading

0 comments on commit 18291ef

Please sign in to comment.