From d0d8f84261fc007ed4b44ce6fef3893c432de219 Mon Sep 17 00:00:00 2001 From: shifter Date: Wed, 27 Nov 2024 02:58:35 +0100 Subject: [PATCH 1/4] filterx: enhanced regexp_subst to support match group identifiers in patterns Since this feature could have a significant impact on performance, it is controlled by a flag called 'groups', which is set to false by default. To enable this feature, add the optional groups=true argument to your regexp_subst call. Signed-off-by: shifter --- lib/filterx/expr-regexp.c | 53 ++++++++++++++++++++++++++++++++++++--- lib/filterx/expr-regexp.h | 2 ++ 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/lib/filterx/expr-regexp.c b/lib/filterx/expr-regexp.c index cb8df99ca..c8ddb28b4 100644 --- a/lib/filterx/expr-regexp.c +++ b/lib/filterx/expr-regexp.c @@ -38,7 +38,8 @@ FILTERX_FUNC_REGEXP_SUBST_FLAG_GLOBAL_NAME"=(boolean) " \ FILTERX_FUNC_REGEXP_SUBST_FLAG_UTF8_NAME"=(boolean) " \ FILTERX_FUNC_REGEXP_SUBST_FLAG_IGNORECASE_NAME"=(boolean) " \ - FILTERX_FUNC_REGEXP_SUBST_FLAG_NEWLINE_NAME"=(boolean))" \ + FILTERX_FUNC_REGEXP_SUBST_FLAG_NEWLINE_NAME"=(boolean) " \ + FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS_NAME"=(boolean))" \ #define FILTERX_FUNC_REGEXP_SEARCH_USAGE "Usage: regexp_search(string, pattern)" @@ -543,18 +544,61 @@ _is_zero_length_match(PCRE2_SIZE *ovector) return ovector[0] == ovector[1]; } +static gboolean +_build_replacement_stirng_with_match_groups(const FilterXFuncRegexpSubst *self, FilterXReMatchState *state, + GString *replacement_string) +{ + PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(state->match_data); + g_string_set_size(replacement_string, 0); + const gchar *rep_ptr = self->replacement; + + while (*rep_ptr) + { + if (*rep_ptr == '\\') + { + rep_ptr++; + if (*rep_ptr >= '1' && *rep_ptr <= '9') + { + gint grp_num = *rep_ptr - '0'; + PCRE2_SIZE start = ovector[2 * grp_num]; + PCRE2_SIZE end = ovector[2 * grp_num + 1]; + if (start != PCRE2_UNSET) + { + size_t group_len = end - start; +
g_string_append_len_inline(replacement_string, state->lhs_str + start, group_len); + } + } + rep_ptr++; + } + else + { + g_string_append_c_inline(replacement_string, *rep_ptr++); + } + } + return TRUE; +} + static FilterXObject * _replace_matches(const FilterXFuncRegexpSubst *self, FilterXReMatchState *state) { GString *new_value = scratch_buffers_alloc(); PCRE2_SIZE *ovector = NULL; gint pos = 0; + const gchar *replacement_string = self->replacement; + + if (self->opts.groups) + { + GString *rep_str = scratch_buffers_alloc(); + _build_replacement_stirng_with_match_groups(self, state, rep_str); + replacement_string = rep_str->str; + } + do { ovector = pcre2_get_ovector_pointer(state->match_data); g_string_append_len(new_value, state->lhs_str + pos, _start_offset(ovector) - pos); - g_string_append(new_value, self->replacement); + g_string_append(new_value, replacement_string); if (_is_zero_length_match(ovector)) { @@ -574,7 +618,7 @@ _replace_matches(const FilterXFuncRegexpSubst *self, FilterXReMatchState *state) // handle the very last of zero lenght matches if (_is_zero_length_match(ovector)) - g_string_append(new_value, self->replacement); + g_string_append(new_value, replacement_string); return filterx_string_new(new_value->str, new_value->len); } @@ -689,6 +733,9 @@ _extract_optional_flags(FilterXFuncRegexpSubst *self, FilterXFunctionArgs *args, if (!_extract_literal_bool(args, FILTERX_FUNC_REGEXP_SUBST_FLAG_UTF8_NAME, &self->opts.utf8, error)) return FALSE; + if (!_extract_literal_bool(args, FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS_NAME, + &self->opts.groups, error)) + return FALSE; return TRUE; } diff --git a/lib/filterx/expr-regexp.h b/lib/filterx/expr-regexp.h index 0e57b7db9..4d5c43d13 100644 --- a/lib/filterx/expr-regexp.h +++ b/lib/filterx/expr-regexp.h @@ -33,6 +33,7 @@ #define FILTERX_FUNC_REGEXP_SUBST_FLAG_UTF8_NAME "utf8" #define FILTERX_FUNC_REGEXP_SUBST_FLAG_IGNORECASE_NAME "ignorecase" #define FILTERX_FUNC_REGEXP_SUBST_FLAG_NEWLINE_NAME "newline" 
+#define FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS_NAME "groups" typedef struct FilterXFuncRegexpSubstOpts_ { @@ -41,6 +42,7 @@ typedef struct FilterXFuncRegexpSubstOpts_ gboolean utf8; gboolean ignorecase; gboolean newline; + gboolean groups; } FilterXFuncRegexpSubstOpts; FilterXExpr *filterx_expr_regexp_match_new(FilterXExpr *lhs, const gchar *pattern); From 18e31ef20358a6f0a761a4bd2b0fa6ae429a9dd9 Mon Sep 17 00:00:00 2001 From: shifter Date: Wed, 27 Nov 2024 02:59:13 +0100 Subject: [PATCH 2/4] filterx: regexp_subst match group handling unit tests Signed-off-by: shifter --- lib/filterx/tests/test_expr_regexp.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/lib/filterx/tests/test_expr_regexp.c b/lib/filterx/tests/test_expr_regexp.c index 6fab5a51f..d4c38f021 100644 --- a/lib/filterx/tests/test_expr_regexp.c +++ b/lib/filterx/tests/test_expr_regexp.c @@ -276,6 +276,9 @@ _build_subst_func(const gchar *pattern, const gchar *repr, const gchar *str, Fil if (opts.utf8) args = g_list_append(args, filterx_function_arg_new(FILTERX_FUNC_REGEXP_SUBST_FLAG_UTF8_NAME, filterx_literal_new(filterx_boolean_new(TRUE)))); + if (opts.groups) + args = g_list_append(args, filterx_function_arg_new(FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS_NAME, + filterx_literal_new(filterx_boolean_new(TRUE)))); GError *err = NULL; FilterXExpr *func = filterx_function_regexp_subst_new(filterx_function_args_new(args, NULL), &err); @@ -527,6 +530,23 @@ Test(filterx_expr_regexp, regexp_subst_match_opt_ignorecase_nojit) filterx_object_unref(result_alt); } +Test(filterx_expr_regexp, regexp_subst_group_subst) +{ + FilterXFuncRegexpSubstOpts opts = {}; + FilterXObject *result = _sub("(\\d{2})-(\\d{2})-(\\d{4})", "\\3-\\2-\\1", "25-02-2022", opts); + cr_assert(filterx_object_is_type(result, &FILTERX_TYPE_NAME(string))); + const gchar *res = filterx_string_get_value_ref(result, NULL); + cr_assert_str_eq(res, "\\3-\\2-\\1"); + filterx_object_unref(result); + + FilterXFuncRegexpSubstOpts opts_alt 
= {.groups = TRUE}; + FilterXObject *result_alt = _sub("(\\d{2})-(\\d{2})-(\\d{4})", "\\3-\\2-\\1", "25-02-2022", opts_alt); + cr_assert(filterx_object_is_type(result_alt, &FILTERX_TYPE_NAME(string))); + const gchar *res_alt = filterx_string_get_value_ref(result_alt, NULL); + cr_assert_str_eq(res_alt, "2022-02-25"); + filterx_object_unref(result_alt); +} + static void setup(void) { From 5535115b66f1150c1272a4c6b6b7fd86970f6504 Mon Sep 17 00:00:00 2001 From: shifter Date: Wed, 27 Nov 2024 03:00:08 +0100 Subject: [PATCH 3/4] filterx: light tests for regexp_subst match group feature Signed-off-by: shifter --- tests/light/functional_tests/filterx/test_filterx.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/light/functional_tests/filterx/test_filterx.py b/tests/light/functional_tests/filterx/test_filterx.py index 53f521f3b..e65b3ae13 100644 --- a/tests/light/functional_tests/filterx/test_filterx.py +++ b/tests/light/functional_tests/filterx/test_filterx.py @@ -1938,6 +1938,9 @@ def test_regexp_subst(config, syslog_ng): $MSG.orgrp_global = regexp_subst("foobarbaz", "(fo|az)", "!", global=true); $MSG.ignore_case_control = regexp_subst("FoObArBaz", "(o|a)", "!", global=true); $MSG.ignore_case = regexp_subst("FoObArBaz", "(o|a)", "!", ignorecase=true, global=true); + $MSG.groups_off = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "\\3-\\2-\\1"); + $MSG.groups_on = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "\\3-\\2-\\1", groups=true); + $MSG.mixed_grps = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "foo:\\3-\\2-\\1:bar:baz", groups=true); """, ) syslog_ng.start(config) @@ -1956,7 +1959,10 @@ def test_regexp_subst(config, syslog_ng): r""""zero_length_match_global":"!f!o!o!b!a!r!b!a!z!",""" r""""orgrp_global":"!obarb!",""" r""""ignore_case_control":"F!ObArB!z",""" - r""""ignore_case":"F!!b!rB!z"}""" + "\n" + r""""ignore_case":"F!!b!rB!z",""" + r""""groups_off":"\\3-\\2-\\1",""" + r""""groups_on":"2022-02-25",""" +
r""""mixed_grps":"foo:2022-02-25:bar:baz"}""" + "\n" ) assert file_true.read_log() == exp From a805102dd844932e9b51446f9200065104b29152 Mon Sep 17 00:00:00 2001 From: shifter Date: Thu, 28 Nov 2024 09:12:29 +0100 Subject: [PATCH 4/4] filterx: performance and safe code fix for regexp_subst match groups + additional unit test for enabled but unused 'groups' feature Also stop at a trailing backslash in the replacement string instead of stepping past its NUL terminator. Signed-off-by: shifter --- lib/filterx/expr-regexp.c | 29 +++++++++++++++++++---------- lib/filterx/tests/test_expr_regexp.c | 10 ++++++++++ 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/lib/filterx/expr-regexp.c b/lib/filterx/expr-regexp.c index c8ddb28b4..fe88776b9 100644 --- a/lib/filterx/expr-regexp.c +++ b/lib/filterx/expr-regexp.c @@ -49,6 +49,7 @@ typedef struct FilterXReMatchState_ FilterXObject *lhs_obj; const gchar *lhs_str; gsize lhs_str_len; + gint rc; } FilterXReMatchState; static void @@ -113,6 +114,7 @@ _match_inner(FilterXReMatchState *state, pcre2_code_8 *pattern, gint start_offse gint rc = pcre2_match(pattern, (PCRE2_SPTR) state->lhs_str, (PCRE2_SIZE) state->lhs_str_len, (PCRE2_SIZE) start_offset, 0, state->match_data, NULL); + state->rc = rc; if (rc < 0) { switch (rc) @@ -551,6 +553,8 @@ _build_replacement_stirng_with_match_groups(const FilterXFuncRegexpSubst *self, PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(state->match_data); g_string_set_size(replacement_string, 0); const gchar *rep_ptr = self->replacement; + const gchar *last_ptr = rep_ptr; + gint num_grps = state->rc; while (*rep_ptr) { @@ -559,22 +563,27 @@ _build_replacement_stirng_with_match_groups(const FilterXFuncRegexpSubst *self, rep_ptr++; if (*rep_ptr >= '1' && *rep_ptr <= '9') { - gint grp_num = *rep_ptr - '0'; - PCRE2_SIZE start = ovector[2 * grp_num]; - PCRE2_SIZE end = ovector[2 * grp_num + 1]; - if (start != PCRE2_UNSET) + gint grp_idx = *rep_ptr - '0'; + if (grp_idx < num_grps) { - size_t group_len = end - start; - g_string_append_len_inline(replacement_string, state->lhs_str + start, group_len); +
PCRE2_SIZE start = ovector[2 * grp_idx]; + PCRE2_SIZE end = ovector[2 * grp_idx + 1]; + if (start != PCRE2_UNSET) + { + g_string_append_len(replacement_string, last_ptr, rep_ptr - last_ptr - 1); + last_ptr = rep_ptr + 1; + size_t group_len = end - start; + g_string_append_len(replacement_string, state->lhs_str + start, group_len); + } } } - rep_ptr++; + if (*rep_ptr) + rep_ptr++; } else - { - g_string_append_c_inline(replacement_string, *rep_ptr++); - } + rep_ptr++; } + g_string_append_len(replacement_string, last_ptr, rep_ptr - last_ptr); return TRUE; } diff --git a/lib/filterx/tests/test_expr_regexp.c b/lib/filterx/tests/test_expr_regexp.c index d4c38f021..3447b0e40 100644 --- a/lib/filterx/tests/test_expr_regexp.c +++ b/lib/filterx/tests/test_expr_regexp.c @@ -547,6 +547,16 @@ Test(filterx_expr_regexp, regexp_subst_group_subst) filterx_object_unref(result_alt); } +Test(filterx_expr_regexp, regexp_subst_group_subst_without_ref) +{ + FilterXFuncRegexpSubstOpts opts = {.groups = TRUE}; + FilterXObject *result = _sub("(\\d{2})-(\\d{2})-(\\d{4})", "group without ref", "25-02-2022", opts); + cr_assert(filterx_object_is_type(result, &FILTERX_TYPE_NAME(string))); + const gchar *res = filterx_string_get_value_ref(result, NULL); + cr_assert_str_eq(res, "group without ref"); + filterx_object_unref(result); +} + static void setup(void) {