From e8330a1693677f677c8117e1a28cdde964c739c3 Mon Sep 17 00:00:00 2001 From: shifter Date: Wed, 4 Dec 2024 16:35:22 +0100 Subject: [PATCH 1/4] filterx: Enhanced support for match group references in regexp_subst to allow up to 3 digits. The parser now supports up to 3 digits for match group references. If the reference exceeds this limit, the parser exits and treats it as a 3-digit reference. This change prevents excessive greediness in processing match group references and improves performance by limiting the reference size. Additionally, match groups can now include leading zeros, allowing users to reference match groups with fewer than 3 digits before alphanumeric characters. For example, \100000000 will be parsed as match group 100, but by using a prefix like \00100000000, the parser will correctly identify it as match group 1. Signed-off-by: shifter --- lib/filterx/expr-regexp-subst.c | 91 ++++++++++++++++++++++++--------- 1 file changed, 67 insertions(+), 24 deletions(-) diff --git a/lib/filterx/expr-regexp-subst.c b/lib/filterx/expr-regexp-subst.c index 59e25798d..4d63ae03f 100644 --- a/lib/filterx/expr-regexp-subst.c +++ b/lib/filterx/expr-regexp-subst.c @@ -35,6 +35,7 @@ #include "filterx/expr-regexp-common.h" #include "compat/pcre.h" #include "scratch-buffers.h" +#include <ctype.h> DEFINE_FUNC_FLAG_NAMES(FilterXRegexpSubstFlags, FILTERX_FUNC_REGEXP_SUBST_FLAG_JIT_NAME, @@ -53,6 +54,7 @@ DEFINE_FUNC_FLAG_NAMES(FilterXRegexpSubstFlags, FILTERX_FUNC_REGEXP_SUBST_FLAG_NEWLINE_NAME"=(boolean)" \ FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS_NAME"=(boolean))" \ +#define FILTERX_FUNC_REGEXP_SUBST_GRP_ID_MAX_DIGITS 3 typedef struct FilterXFuncRegexpSubst_ { @@ -63,43 +65,84 @@ typedef struct FilterXFuncRegexpSubst_ FLAGSET flags; } FilterXFuncRegexpSubst; +static gchar * +_next_matchgrp_ref(gchar *from, gchar **to) +{ + if (from == NULL || *from == '\0') + return NULL; + g_assert(to); + while (*from != '\0') + { + if ((*from == '\\') && isdigit((unsigned char) *(from + 1))) /* <ctype.h> classifiers need an unsigned char value; plain gchar may be negative for non-ASCII bytes */ + { + gchar *start =
from; + from += 2; + while (isdigit((unsigned char) *from) && from - start <= FILTERX_FUNC_REGEXP_SUBST_GRP_ID_MAX_DIGITS) + { + from++; + } + *to = from; + return start; + } + from++; + } + return NULL; +} + +static gboolean +_parse_machgrp_ref(const gchar *from, const gchar *to, gint *value) +{ + if (!from || !to || !value || from >= to || to > from + 5) + { + return FALSE; + } + + if (*from != '\\') + { + return FALSE; + } + + from++; + *value = 0; + + while (from < to && isdigit((unsigned char) *from)) + { + *value = (*value * 10) + (*from - '0'); + from++; + } + + return from == to; +} + static gboolean _build_replacement_stirng_with_match_groups(const FilterXFuncRegexpSubst *self, FilterXReMatchState *state, GString *replacement_string) { - PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(state->match_data); g_string_set_size(replacement_string, 0); - const gchar *rep_ptr = self->replacement; - const gchar *last_ptr = rep_ptr; gint num_grps = state->rc; + PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(state->match_data); - while (*rep_ptr) + gchar *pos = self->replacement; + gchar *last = pos; + gchar *close = NULL; + gint idx = -1; + while ((pos = _next_matchgrp_ref(pos, &close)) != NULL) { - if (*rep_ptr == '\\') + if (_parse_machgrp_ref(pos, close, &idx) && (idx < num_grps)) { - rep_ptr++; - if (*rep_ptr >= '1' && *rep_ptr <= '9') + PCRE2_SIZE start = ovector[2 * idx]; + PCRE2_SIZE end = ovector[2 * idx + 1]; + if (start != PCRE2_UNSET) { - gint grp_idx = *rep_ptr - '0'; - if (grp_idx < num_grps) - { - PCRE2_SIZE start = ovector[2 * grp_idx]; - PCRE2_SIZE end = ovector[2 * grp_idx + 1]; - if (start != PCRE2_UNSET) - { - g_string_append_len(replacement_string, last_ptr, rep_ptr - last_ptr - 1); - last_ptr = rep_ptr + 1; - size_t group_len = end - start; - g_string_append_len(replacement_string, state->lhs_str + start, group_len); - } - } + g_string_append_len(replacement_string, last, pos - last); + last = close; + size_t group_len = end - start; + g_string_append_len(replacement_string, 
state->lhs_str + start, group_len); } - rep_ptr++; } - else - rep_ptr++; + pos = close; } - g_string_append_len(replacement_string, last_ptr, rep_ptr - last_ptr); + g_string_append(replacement_string, last); /* the scan loop only exits with pos == NULL, so append the NUL-terminated tail instead of computing pos - last */ return TRUE; } From 1fbb4ff426976b791de7f790553027c71b813a64 Mon Sep 17 00:00:00 2001 From: shifter Date: Wed, 4 Dec 2024 16:36:29 +0100 Subject: [PATCH 2/4] filterx: regexp_subst multi-digit match group IDs unit tests Signed-off-by: shifter --- lib/filterx/tests/test_expr_regexp_subst.c | 33 ++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/lib/filterx/tests/test_expr_regexp_subst.c b/lib/filterx/tests/test_expr_regexp_subst.c index 3270cfadd..a2996d41f 100644 --- a/lib/filterx/tests/test_expr_regexp_subst.c +++ b/lib/filterx/tests/test_expr_regexp_subst.c @@ -350,6 +350,39 @@ Test(filterx_expr_regexp_subst, regexp_subst_group_subst_without_ref) filterx_object_unref(result); } +Test(filterx_expr_regexp_subst, regexp_subst_group_reference_with_multiple_digits) +{ + FilterXFuncRegexpSubstOpts opts = {.groups = TRUE}; + FilterXObject *result = + _sub("(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})", + "\\12-\\11-\\10-\\9\\8\\7\\6\\5\\4\\3\\2\\1", "010203040506070809101112", opts); + cr_assert(filterx_object_is_type(result, &FILTERX_TYPE_NAME(string))); + const gchar *res = filterx_string_get_value_ref(result, NULL); + cr_assert_str_eq(res, "12-11-10-090807060504030201"); + filterx_object_unref(result); +} + +Test(filterx_expr_regexp_subst, regexp_subst_group_do_not_replace_unknown_ref) +{ + FilterXFuncRegexpSubstOpts opts = {.groups = TRUE}; + FilterXObject *result = _sub("(\\d{2})(\\d{2})(\\d{2})", + "\\3\\20\\1", "010203", opts); + cr_assert(filterx_object_is_type(result, &FILTERX_TYPE_NAME(string))); + const gchar *res = filterx_string_get_value_ref(result, NULL); + cr_assert_str_eq(res, "03\\2001"); + filterx_object_unref(result); +} + +Test(filterx_expr_regexp_subst, 
regexp_subst_group_limited_digits_and_zero_prefixes) +{ + FilterXFuncRegexpSubstOpts opts = {.groups = TRUE}; + FilterXObject *result = _sub("(\\w+),(\\w+),(\\w+)", "\\3\\02\\0013.14", "baz,bar,foo", opts); + cr_assert(filterx_object_is_type(result, &FILTERX_TYPE_NAME(string))); + const gchar *res = filterx_string_get_value_ref(result, NULL); + cr_assert_str_eq(res, "foobarbaz3.14"); + filterx_object_unref(result); +} + static void setup(void) { From 2ba815d554844d1833462cfada1205684d00cac3 Mon Sep 17 00:00:00 2001 From: shifter Date: Wed, 4 Dec 2024 16:44:16 +0100 Subject: [PATCH 3/4] filterx: regexp_subst auto-disable 'groups' when no match groups are present The function now automatically checks the replacement pattern at config parsing time. If no match group references are found in the pattern, it will behave as if the 'groups' option is disabled, disabling match group functionality for performance reasons. By default, match groups are enabled in regexp_subst. However, this behavior can now be explicitly disabled by setting the optional named argument groups=false, as shown: `regexp_subst(target, match_pattern, replace_pattern, groups=false);` Signed-off-by: shifter --- lib/filterx/expr-regexp-subst.c | 15 ++++++++++++--- lib/filterx/tests/test_expr_regexp_subst.c | 4 ++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/lib/filterx/expr-regexp-subst.c b/lib/filterx/expr-regexp-subst.c index 4d63ae03f..ba6d2bb95 100644 --- a/lib/filterx/expr-regexp-subst.c +++ b/lib/filterx/expr-regexp-subst.c @@ -160,7 +160,6 @@ _replace_matches(const FilterXFuncRegexpSubst *self, FilterXReMatchState *state) _build_replacement_stirng_with_match_groups(self, state, rep_str); replacement_string = rep_str->str; } - do { ovector = pcre2_get_ovector_pointer(state->match_data); @@ -296,6 +295,13 @@ _extract_optional_flags(FilterXFuncRegexpSubst *self, FilterXFunctionArgs *args, return TRUE; } +static gboolean +_contains_match_grp_ref(gchar *str) +{ + gchar *close = 
NULL; + return _next_matchgrp_ref(str, &close) != NULL; +} + static gboolean _extract_subst_args(FilterXFuncRegexpSubst *self, FilterXFunctionArgs *args, GError **error) { @@ -320,7 +326,9 @@ _extract_subst_args(FilterXFuncRegexpSubst *self, FilterXFunctionArgs *args, GEr self->replacement = _extract_subst_replacement_arg(args, error); if (!self->replacement) return FALSE; - + // turn off group mode if there is no match grp ref due to its performance impact + if (!_contains_match_grp_ref(self->replacement)) + set_flag(&self->flags, FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS, FALSE); return TRUE; } @@ -365,7 +373,8 @@ filterx_function_regexp_subst_new(FilterXFunctionArgs *args, GError **error) self->super.super.deinit = _subst_deinit; self->super.super.free_fn = _subst_free; - reset_flags(&self->flags, FLAG_VAL(FILTERX_FUNC_REGEXP_SUBST_FLAG_JIT)); + reset_flags(&self->flags, FLAG_VAL(FILTERX_FUNC_REGEXP_SUBST_FLAG_JIT) | FLAG_VAL( + FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS)); if (!_extract_subst_args(self, args, error) || !filterx_function_args_check(args, error)) goto error; diff --git a/lib/filterx/tests/test_expr_regexp_subst.c b/lib/filterx/tests/test_expr_regexp_subst.c index a2996d41f..ad417744b 100644 --- a/lib/filterx/tests/test_expr_regexp_subst.c +++ b/lib/filterx/tests/test_expr_regexp_subst.c @@ -69,9 +69,9 @@ _build_subst_func(const gchar *pattern, const gchar *repr, const gchar *str, Fil if (opts.utf8) args = g_list_append(args, filterx_function_arg_new(FILTERX_FUNC_REGEXP_SUBST_FLAG_UTF8_NAME, filterx_literal_new(filterx_boolean_new(TRUE)))); - if (opts.groups) + if (!opts.groups) args = g_list_append(args, filterx_function_arg_new(FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS_NAME, - filterx_literal_new(filterx_boolean_new(TRUE)))); + filterx_literal_new(filterx_boolean_new(FALSE)))); GError *err = NULL; FilterXExpr *func = filterx_function_regexp_subst_new(filterx_function_args_new(args, NULL), &err); From 231c51aca649ad361ff7cbd9ec825063ae4856e6 Mon Sep 17 
00:00:00 2001 From: shifter Date: Thu, 5 Dec 2024 11:26:52 +0100 Subject: [PATCH 4/4] filterx: regexp_subst light tests Signed-off-by: shifter --- tests/light/functional_tests/filterx/test_filterx.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/light/functional_tests/filterx/test_filterx.py b/tests/light/functional_tests/filterx/test_filterx.py index 33b7b5e45..6ce1469ce 100644 --- a/tests/light/functional_tests/filterx/test_filterx.py +++ b/tests/light/functional_tests/filterx/test_filterx.py @@ -2004,9 +2004,11 @@ def test_regexp_subst(config, syslog_ng): $MSG.orgrp_global = regexp_subst("foobarbaz", "(fo|az)", "!", global=true); $MSG.ignore_case_control = regexp_subst("FoObArBaz", "(o|a)", "!", global=true); $MSG.ignore_case = regexp_subst("FoObArBaz", "(o|a)", "!", ignorecase=true, global=true); - $MSG.groups_off = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "\\3-\\2-\\1");; - $MSG.groups_on = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "\\3-\\2-\\1", groups=true); - $MSG.mixed_grps = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "foo:\\3-\\2-\\1:bar:baz", groups=true); + $MSG.groups_off = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "\\3-\\2-\\1", groups=false); + $MSG.groups_on = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "\\3-\\2-\\1"); + $MSG.mixed_grps = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "foo:\\3-\\2-\\1:bar:baz"); + $MSG.multi_digit_grps = regexp_subst("010203040506070809101112", /(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})/, "\\10-\\11-\\12"); + $MSG.prefixing_zeros = regexp_subst("foobar", /^(.*)$/, "\\001012345"); """, ) syslog_ng.start(config) @@ -2028,7 +2030,9 @@ def test_regexp_subst(config, syslog_ng): r""""ignore_case":"F!!b!rB!z",""" r""""groups_off":"\\3-\\2-\\1",""" r""""groups_on":"2022-02-25",""" - r""""mixed_grps":"foo:2022-02-25:bar:baz"}""" + "\n" + 
r""""mixed_grps":"foo:2022-02-25:bar:baz",""" + r""""multi_digit_grps":"10-11-12",""" + r""""prefixing_zeros":"foobar012345"}""" + "\n" ) assert file_true.read_log() == exp