Skip to content

Commit

Permalink
Merge branch 'prometheus-community:master' into allamiro-dev
Browse files Browse the repository at this point in the history
  • Loading branch information
allamiro authored Nov 16, 2024
2 parents bdffbf2 + 39b36d0 commit deaa777
Show file tree
Hide file tree
Showing 10 changed files with 267 additions and 63 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@ jobs:
flake8:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.11'
python-version: '3.12'
- uses: py-actions/flake8@v2

shellcheck:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- uses: ludeeus/action-shellcheck@master
64 changes: 34 additions & 30 deletions apt_info.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,26 @@
#!/usr/bin/env python3
#
# Description: Expose metrics from apt. This is inspired by and
# intended to be a replacement for the original apt.sh.
#
# This script deliberately does *not* update the apt cache. You need
# something else to run `apt update` regularly for the metrics to be
# up to date. This can be done in numerous ways, but the canonical way
# is to use the normal `APT::Periodic::Update-Package-Lists`
# setting.
#
# This, for example, will enable a nightly job that runs `apt update`:
#
# echo 'APT::Periodic::Update-Package-Lists "1";' > /etc/apt/apt.conf.d/99_auto_apt_update.conf
#
# See /usr/lib/apt/apt.systemd.daily for details.
#
# Dependencies: python3-apt, python3-prometheus-client
#
# Authors: Kyle Fazzari <kyrofa@ubuntu.com>
# Daniel Swarbrick <dswarbrick@debian.org>

"""
Description: Expose metrics from apt. This is inspired by and
intended to be a replacement for the original apt.sh.
This script deliberately does *not* update the apt cache. You need
something else to run `apt update` regularly for the metrics to be
up to date. This can be done in numerous ways, but the canonical way
is to use the normal `APT::Periodic::Update-Package-Lists`
setting.
This, for example, will enable a nightly job that runs `apt update`:
echo 'APT::Periodic::Update-Package-Lists "1";' > /etc/apt/apt.conf.d/99_auto_apt_update.conf
See /usr/lib/apt/apt.systemd.daily for details.
Dependencies: python3-apt, python3-prometheus-client
Authors: Kyle Fazzari <kyrofa@ubuntu.com>
Daniel Swarbrick <dswarbrick@debian.org>
"""

import apt
import apt_pkg
Expand Down Expand Up @@ -52,12 +54,8 @@ def _convert_candidates_to_upgrade_infos(candidates):


def _write_pending_upgrades(registry, cache):
# Discount any changes that apply to packages that aren't installed (e.g.
# count an upgrade to package A that adds a new dependency on package B as
# only one upgrade, not two). See the following issue for more details:
# https://github.com/prometheus-community/node-exporter-textfile-collector-scripts/issues/85
candidates = {
p.candidate for p in cache.get_changes() if p.is_installed and p.marked_upgrade
p.candidate for p in cache if p.is_upgradable
}
upgrade_list = _convert_candidates_to_upgrade_infos(candidates)

Expand All @@ -69,7 +67,10 @@ def _write_pending_upgrades(registry, cache):


def _write_held_upgrades(registry, cache):
held_candidates = {p.candidate for p in cache if p.is_upgradable and p.marked_keep}
held_candidates = {
p.candidate for p in cache
if p.is_upgradable and p._pkg.selected_state == apt_pkg.SELSTATE_HOLD
}
upgrade_list = _convert_candidates_to_upgrade_infos(held_candidates)

if upgrade_list:
Expand All @@ -89,13 +90,16 @@ def _write_autoremove_pending(registry, cache):
def _write_cache_timestamps(registry):
g = Gauge('apt_package_cache_timestamp_seconds', "Apt update last run time.", registry=registry)
apt_pkg.init_config()
if apt_pkg.config.find_b("APT::Periodic::Update-Package-Lists"):
if (
apt_pkg.config.find_b("APT::Periodic::Update-Package-Lists") and
os.path.isfile("/var/lib/apt/periodic/update-success-stamp")
):
# if we run updates automatically with APT::Periodic, we can
# check this timestamp file
# check this timestamp file if it exists
stamp_file = "/var/lib/apt/periodic/update-success-stamp"
else:
# if not, let's just fallback on the lists directory
stamp_file = '/var/lib/apt/lists'
# if not, let's just fall back on the partial subdirectory of the lists directory
stamp_file = '/var/lib/apt/lists/partial'
try:
g.set(os.stat(stamp_file).st_mtime)
except OSError:
Expand Down
5 changes: 4 additions & 1 deletion btrfs_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
from prometheus_client import CollectorRegistry, Gauge, generate_latest


DEVICE_PATTERN = re.compile(r"^\[([^\]]+)\]\.(\S+)\s+(\d+)$")


def get_btrfs_mount_points():
"""List all btrfs mount points.
Expand Down Expand Up @@ -47,7 +50,7 @@ def get_btrfs_errors(mountpoint):
continue
# Sample line:
# [/dev/vdb1].flush_io_errs 0
m = re.search(r"^\[([^\]]+)\]\.(\S+)\s+(\d+)$", line.decode("utf-8"))
m = DEVICE_PATTERN.match(line.decode("utf-8"))
if not m:
raise RuntimeError("unexpected output from btrfs: '%s'" % line)
yield m.group(1), m.group(2), int(m.group(3))
Expand Down
2 changes: 1 addition & 1 deletion directory-size.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@
echo "# HELP node_directory_size_bytes Disk space used by some directories"
echo "# TYPE node_directory_size_bytes gauge"
du --block-size=1 --summarize "$@" \
| sed -ne 's/\\/\\\\/;s/"/\\"/g;s/^\([0-9]\+\)\t\(.*\)$/node_directory_size_bytes{directory="\2"} \1/p'
| awk '{ print "node_directory_size_bytes{directory=\"" $2 "\"} " $1 }'
172 changes: 172 additions & 0 deletions needrestart_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
#!/usr/bin/env python3

"""
Description: Expose metrics from needrestart.
This script runs needrestart in batch mode. It will never ask for input
and will never restart or upgrade anything.
Dependencies: python >= 3.7, python3-prometheus-client, needrestart
Authors: RomainMou
"""

import sys
import time
import subprocess
from collections import Counter
from enum import Enum

from prometheus_client import (
CollectorRegistry,
Gauge,
generate_latest,
)


class KernelStatus(Enum):
    """Kernel state as reported in the ``NEEDRESTART-KSTA`` output field.

    A member is looked up from the integer found in the needrestart output,
    and its ``value`` is what ends up as the exported gauge sample.
    """

    # NOTE(review): the meaning of each code is defined by needrestart's
    # batch-mode output format -- confirm against its documentation.
    UNKNOWN = 0
    CURRENT = 1
    ABI_UPGRADE = 2
    VERSION_UPGRADE = 3


class MicroCodeStatus(Enum):
    """Microcode state as reported in the ``NEEDRESTART-UCSTA`` output field.

    A member is looked up from the integer found in the needrestart output,
    and its ``value`` is what ends up as the exported gauge sample.
    """

    # NOTE(review): the meaning of each code is defined by needrestart's
    # batch-mode output format -- confirm against its documentation.
    UNKNOWN = 0
    CURRENT = 1
    OBSOLETE = 2


class NeedRestartData:
    """Parsed result of a needrestart batch-mode run.

    The constructor reads ``KEY: value`` lines and exposes:

    * ``timestamp``: integer Unix time at which the object was built.
    * ``version``: value of ``NEEDRESTART-VER`` (``None`` if absent).
    * ``kernel_status`` / ``microcode_status``: enum members built from
      ``NEEDRESTART-KSTA`` / ``NEEDRESTART-UCSTA`` (``None`` if absent).
    * ``kernel_current_version`` / ``kernel_expected_version`` and
      ``microcode_current_version`` / ``microcode_expected_version``:
      raw strings, empty when the field is absent.
    * ``services_count`` / ``containers_count`` / ``sessions_count``:
      number of ``NEEDRESTART-SVC`` / ``-CONT`` / ``-SESS`` lines.
    """

    def __init__(self, needrestart_output):
        """Parse the text printed by needrestart.

        Args:
            needrestart_output: Output text, one ``KEY: value`` pair per line.

        Raises:
            ValueError: If a kernel or microcode status is not an integer
                known to the corresponding enum.
        """
        # Defaults used when a field is missing from the output.
        self.timestamp = int(time.time())
        self.version = None
        self.kernel_status = None
        self.microcode_status = None
        self.kernel_current_version = ""
        self.kernel_expected_version = ""
        self.microcode_current_version = ""
        self.microcode_expected_version = ""
        needrestart_counter = Counter()

        # Parse the command output.
        for line in needrestart_output.splitlines():
            # partition() never raises: blank lines or stray text without the
            # ": " separator are skipped instead of aborting the whole run.
            key, separator, value = line.partition(": ")
            if not separator:
                continue

            if key == "NEEDRESTART-VER":
                self.version = value
            # Kernel information
            elif key == "NEEDRESTART-KCUR":
                self.kernel_current_version = value
            elif key == "NEEDRESTART-KEXP":
                self.kernel_expected_version = value
            elif key == "NEEDRESTART-KSTA":
                self.kernel_status = KernelStatus(int(value))
            # Microcode information
            elif key == "NEEDRESTART-UCCUR":
                self.microcode_current_version = value
            elif key == "NEEDRESTART-UCEXP":
                self.microcode_expected_version = value
            elif key == "NEEDRESTART-UCSTA":
                self.microcode_status = MicroCodeStatus(int(value))
            # Every other key is only counted (one line per affected item).
            else:
                needrestart_counter[key] += 1

        # Counter returns 0 for keys that never appeared.
        self.services_count = needrestart_counter["NEEDRESTART-SVC"]
        self.containers_count = needrestart_counter["NEEDRESTART-CONT"]
        self.sessions_count = needrestart_counter["NEEDRESTART-SESS"]


def write_timestamp(registry, needrestart_data):
    """Register the gauge recording when needrestart was run.

    The sample value is ``needrestart_data.timestamp``; the needrestart
    version is attached as the ``version`` label.
    """
    timestamp_gauge = Gauge(
        "needrestart_timestamp_seconds",
        "information about the version and when it was last run",
        labelnames=["version"],
        registry=registry,
    )
    labelled = timestamp_gauge.labels(needrestart_data.version)
    labelled.set(needrestart_data.timestamp)


def write_kernel(registry, needrestart_data):
    """Register the kernel status gauge when a kernel status was parsed.

    The sample value is the numeric value of the status enum member; the
    current and expected kernel versions are attached as labels. Nothing is
    registered when ``needrestart_data.kernel_status`` is ``None``.
    """
    status = needrestart_data.kernel_status
    # Test for None explicitly instead of relying on the truthiness of the
    # enum member, so a status whose value is 0 (UNKNOWN) is still exported
    # regardless of how the enum evaluates in a boolean context.
    if status is None:
        return

    kernel_gauge = Gauge(
        "needrestart_kernel_status_info",
        "information about the kernel status",
        labelnames=["current", "expected"],
        registry=registry,
    )
    kernel_gauge.labels(
        needrestart_data.kernel_current_version,
        needrestart_data.kernel_expected_version,
    ).set(status.value)


def write_microcode(registry, needrestart_data):
    """Register the microcode status gauge when a status was parsed.

    The sample value is the numeric value of the status enum member; the
    current and expected microcode versions are attached as labels. Nothing
    is registered when ``needrestart_data.microcode_status`` is ``None``.
    """
    status = needrestart_data.microcode_status
    # Test for None explicitly instead of relying on the truthiness of the
    # enum member, so a status whose value is 0 (UNKNOWN) is still exported
    # regardless of how the enum evaluates in a boolean context.
    if status is None:
        return

    microcode_gauge = Gauge(
        "needrestart_microcode_status_info",
        "information about the microcode status",
        labelnames=["current", "expected"],
        registry=registry,
    )
    microcode_gauge.labels(
        needrestart_data.microcode_current_version,
        needrestart_data.microcode_expected_version,
    ).set(status.value)


def write_services(registry, needrestart_data):
    """Register the gauge holding ``needrestart_data.services_count``."""
    services_gauge = Gauge(
        name="needrestart_services_total",
        documentation="number of services requiring a restart",
        registry=registry,
    )
    pending_services = needrestart_data.services_count
    services_gauge.set(pending_services)


def write_containers(registry, needrestart_data):
    """Register the gauge holding ``needrestart_data.containers_count``."""
    containers_gauge = Gauge(
        name="needrestart_containers_total",
        documentation="number of containers requiring a restart",
        registry=registry,
    )
    pending_containers = needrestart_data.containers_count
    containers_gauge.set(pending_containers)


def write_sessions(registry, needrestart_data):
    """Register the gauge holding ``needrestart_data.sessions_count``."""
    sessions_gauge = Gauge(
        name="needrestart_sessions_total",
        documentation="number of sessions requiring a restart",
        registry=registry,
    )
    pending_sessions = needrestart_data.sessions_count
    sessions_gauge.set(pending_sessions)


def main():
    """Run ``needrestart -b`` and print the resulting metrics to stdout.

    Exits with status 1, after reporting on stderr, when needrestart cannot
    be executed, returns a non-zero exit status, or prints output that
    cannot be parsed.
    """
    registry = CollectorRegistry()

    try:
        needrestart_output = subprocess.run(
            ["needrestart", "-b"], capture_output=True, text=True, check=True
        ).stdout
        needrestart_data = NeedRestartData(needrestart_output)
    except subprocess.CalledProcessError as e:
        print(f"Error executing needrestart:\n{e}", file=sys.stderr)
        # stderr was captured by capture_output=True; forward it, otherwise
        # the actual reason for the failure is lost.
        if e.stderr:
            print(e.stderr.rstrip("\n"), file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: covers a missing executable and parse errors.
        print(f"An unexpected error occurred:\n{e}", file=sys.stderr)
        sys.exit(1)

    metric_writers = (
        write_timestamp,
        write_kernel,
        write_microcode,
        write_services,
        write_containers,
        write_sessions,
    )
    for write_metric in metric_writers:
        write_metric(registry, needrestart_data)

    # generate_latest() output is decoded and printed without adding an
    # extra trailing newline.
    print(generate_latest(registry).decode(), end="")


if __name__ == "__main__":
main()
6 changes: 3 additions & 3 deletions ntpd_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@
from prometheus_client import CollectorRegistry, Gauge, generate_latest

# NTP peers status, with no DNS lookups.
ntpq_cmd = ['ntpq', '-np']
ntpq_cmd = ['ntpq', '-np', '-W', '255']
ntpq_rv_cmd = ['ntpq', '-c', 'rv 0 offset,sys_jitter,rootdisp,rootdelay']

# Regex to match all of the fields in the output of ntpq -np
metrics_fields = [
r'^(?P<status>.)(?P<remote>[\w\.]+)',
r'(?P<refid>[\w\.]+)',
r'^(?P<status>.)(?P<remote>[\w\.:]+)',
r'(?P<refid>[\w\.:]+)',
r'(?P<stratum>\d+)',
r'(?P<type>\w)',
r'(?P<when>\d+)',
Expand Down
7 changes: 7 additions & 0 deletions smartmon.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,13 @@
'host_writes_mib',
'host_writes_32mib',
'load_cycle_count',
'lifetime_writes_gib',
'media_wearout_indicator',
'percent_lifetime_remain',
'wear_leveling_count',
'nand_writes_1gib',
'offline_uncorrectable',
'percent_lifetime_remain',
'power_cycle_count',
'power_on_hours',
'program_fail_count',
Expand All @@ -60,10 +63,14 @@
'temperature_case',
'temperature_celsius',
'temperature_internal',
'total_bad_block',
'total_lbas_read',
'total_lbas_written',
'total_writes_gib',
'total_reads_gib',
'udma_crc_error_count',
'unsafe_shutdown_count',
'unexpect_power_loss_ct',
'workld_host_reads_perc',
'workld_media_wear_indic',
'workload_minutes',
Expand Down
8 changes: 5 additions & 3 deletions smartmon.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ load_cycle_count
media_wearout_indicator
nand_writes_1gib
offline_uncorrectable
percent_lifetime_remain
power_cycle_count
power_on_hours
program_fail_cnt_total
Expand Down Expand Up @@ -166,12 +167,13 @@ format_output() {
awk -F'{' "${output_format_awk}"
}

smartctl_version="$(/usr/sbin/smartctl -V | head -n1 | awk '$1 == "smartctl" {print $2}')"
smartctl_version="$(/usr/sbin/smartctl -V | awk 'NR==1 && $1 == "smartctl" {print $2}')"

echo "smartctl_version{version=\"${smartctl_version}\"} 1" | format_output

if [[ "$(expr "${smartctl_version}" : '\([0-9]*\)\..*')" -lt 6 ]]; then
exit
# Exit if the smartctl version is lower than 6
if [[ ${smartctl_version%.*} -lt 6 ]]; then
exit 0
fi

device_list="$(/usr/sbin/smartctl --scan-open | awk '/^\/dev/{print $1 "|" $3}')"
Expand Down
Loading

0 comments on commit deaa777

Please sign in to comment.