Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Filterx regexp_subst match group changes #409

Merged
merged 4 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 79 additions & 27 deletions lib/filterx/expr-regexp-subst.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include "filterx/expr-regexp-common.h"
#include "compat/pcre.h"
#include "scratch-buffers.h"
#include <ctype.h>

DEFINE_FUNC_FLAG_NAMES(FilterXRegexpSubstFlags,
FILTERX_FUNC_REGEXP_SUBST_FLAG_JIT_NAME,
Expand All @@ -53,6 +54,7 @@ DEFINE_FUNC_FLAG_NAMES(FilterXRegexpSubstFlags,
FILTERX_FUNC_REGEXP_SUBST_FLAG_NEWLINE_NAME"=(boolean)" \
FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS_NAME"=(boolean))" \

#define FILTERX_FUNC_REGEXP_SUBST_GRP_ID_MAX_DIGITS 3

typedef struct FilterXFuncRegexpSubst_
{
Expand All @@ -63,43 +65,84 @@ typedef struct FilterXFuncRegexpSubst_
FLAGSET flags;
} FilterXFuncRegexpSubst;

/*
 * Locate the next match group reference in a NUL-terminated string.
 *
 * A reference is a backslash immediately followed by at least one decimal
 * digit.  At most FILTERX_FUNC_REGEXP_SUBST_GRP_ID_MAX_DIGITS digits are
 * consumed; any further digits are left in place as ordinary text.
 *
 * from: position to start scanning at; NULL or an empty string yields NULL.
 * to:   output, must not be NULL.  On success it receives the position of the
 *       first character after the reference.  It is not written when no
 *       reference is found.
 *
 * Returns a pointer to the backslash opening the reference, or NULL when the
 * remainder of the string contains no reference.
 */
static gchar *
_next_matchgrp_ref(gchar *from, gchar **to)
{
  if (from == NULL || *from == '\0')
    return NULL;
  g_assert(to);

  for (; *from != '\0'; from++)
    {
      /* <ctype.h> classifiers are only defined for values representable as
       * unsigned char (or EOF).  A plain gchar can be negative for bytes
       * >= 0x80, so it has to be converted before the call. */
      if (*from != '\\' || !isdigit((unsigned char) from[1]))
        continue;

      gchar *start = from;
      from += 2; /* step over the backslash and the first digit */
      while (isdigit((unsigned char) *from) && from - start <= FILTERX_FUNC_REGEXP_SUBST_GRP_ID_MAX_DIGITS)
        from++;

      *to = from;
      return start;
    }

  return NULL;
}

/*
 * Convert the match group reference occupying [from, to) into its index.
 *
 * from:  first character of the reference; must be a backslash.
 * to:    one past the last character of the reference.
 * value: output, receives the decimal value of the digits after the backslash.
 *        Only meaningful when TRUE is returned.
 *
 * Returns TRUE when the span is a backslash followed by one or more decimal
 * digits and nothing else.  Returns FALSE for NULL arguments, an empty or
 * reversed span, a span longer than 5 characters, a missing backslash, a
 * backslash without digits, or any non-digit character inside the span.
 */
static gboolean
_parse_machgrp_ref(const gchar *from, const gchar *to, gint *value)
{
  /* Compare the distance instead of computing from + 5, which could point
   * past the end of a short string.  The length cap also keeps the
   * accumulated value far below the range of gint. */
  if (!from || !to || !value || from >= to || to - from > 5)
    return FALSE;

  if (*from != '\\')
    return FALSE;

  from++;

  /* a lone backslash is not a reference; do not report it as group 0 */
  if (from == to)
    return FALSE;

  *value = 0;

  /* cast: <ctype.h> classifiers must not be given a negative char value */
  while (from < to && isdigit((unsigned char) *from))
    {
      *value = (*value * 10) + (*from - '0');
      from++;
    }

  /* stopping early means a non-digit was found inside the span */
  return from == to;
}

/*
 * Build the replacement text for the current match by expanding the match
 * group references found in self->replacement.
 *
 * self:               supplies the replacement template (self->replacement).
 * state:              supplies the match data, the number of groups
 *                     (state->rc) and the subject string (state->lhs_str).
 * replacement_string: output buffer; truncated to zero length first, then
 *                     filled with the expanded replacement.
 *
 * A reference is replaced by the text of its capture group only when it
 * parses, its index is below state->rc and the group is set in the ovector.
 * Any other reference is copied through unchanged as literal text.
 *
 * Always returns TRUE.
 */
static gboolean
_build_replacement_stirng_with_match_groups(const FilterXFuncRegexpSubst *self, FilterXReMatchState *state,
                                            GString *replacement_string)
{
  g_string_set_size(replacement_string, 0);
  gint num_grps = state->rc;
  PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(state->match_data);

  gchar *pos = self->replacement;
  gchar *last = pos;   /* start of the literal text not yet copied */
  gchar *close = NULL; /* first character after the current reference */
  gint idx = -1;

  while ((pos = _next_matchgrp_ref(pos, &close)) != NULL)
    {
      if (_parse_machgrp_ref(pos, close, &idx) && (idx < num_grps))
        {
          PCRE2_SIZE start = ovector[2 * idx];
          PCRE2_SIZE end = ovector[2 * idx + 1];
          if (start != PCRE2_UNSET)
            {
              /* literal text preceding the reference, then the captured text */
              g_string_append_len(replacement_string, last, pos - last);
              last = close;
              size_t group_len = end - start;
              g_string_append_len(replacement_string, state->lhs_str + start, group_len);
            }
        }
      /* references that were not expanded stay inside the [last, ...) literal run */
      pos = close;
    }

  /* The loop only terminates once pos is NULL, so the tail length must not be
   * derived from pos; append the rest of the NUL-terminated template instead. */
  g_string_append(replacement_string, last);
  return TRUE;
}

Expand All @@ -117,7 +160,6 @@ _replace_matches(const FilterXFuncRegexpSubst *self, FilterXReMatchState *state)
_build_replacement_stirng_with_match_groups(self, state, rep_str);
replacement_string = rep_str->str;
}

do
{
ovector = pcre2_get_ovector_pointer(state->match_data);
Expand Down Expand Up @@ -253,6 +295,13 @@ _extract_optional_flags(FilterXFuncRegexpSubst *self, FilterXFunctionArgs *args,
return TRUE;
}

/*
 * Report whether str holds at least one match group reference, as recognised
 * by _next_matchgrp_ref().  The end position of the reference is discarded.
 */
static gboolean
_contains_match_grp_ref(gchar *str)
{
  gchar *ref_end = NULL;
  gchar *ref_start = _next_matchgrp_ref(str, &ref_end);

  return ref_start != NULL;
}

static gboolean
_extract_subst_args(FilterXFuncRegexpSubst *self, FilterXFunctionArgs *args, GError **error)
{
Expand All @@ -277,7 +326,9 @@ _extract_subst_args(FilterXFuncRegexpSubst *self, FilterXFunctionArgs *args, GEr
self->replacement = _extract_subst_replacement_arg(args, error);
if (!self->replacement)
return FALSE;

// turn off group mode if there is no match group reference, because of its performance impact
if (!_contains_match_grp_ref(self->replacement))
set_flag(&self->flags, FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS, FALSE);

return TRUE;
}
Expand Down Expand Up @@ -322,7 +373,8 @@ filterx_function_regexp_subst_new(FilterXFunctionArgs *args, GError **error)
self->super.super.deinit = _subst_deinit;
self->super.super.free_fn = _subst_free;

reset_flags(&self->flags, FLAG_VAL(FILTERX_FUNC_REGEXP_SUBST_FLAG_JIT));
reset_flags(&self->flags, FLAG_VAL(FILTERX_FUNC_REGEXP_SUBST_FLAG_JIT) | FLAG_VAL(
FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS));
if (!_extract_subst_args(self, args, error) ||
!filterx_function_args_check(args, error))
goto error;
Expand Down
37 changes: 35 additions & 2 deletions lib/filterx/tests/test_expr_regexp_subst.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@ _build_subst_func(const gchar *pattern, const gchar *repr, const gchar *str, Fil
if (opts.utf8)
args = g_list_append(args, filterx_function_arg_new(FILTERX_FUNC_REGEXP_SUBST_FLAG_UTF8_NAME,
filterx_literal_new(filterx_boolean_new(TRUE))));
if (opts.groups)
if (!opts.groups)
args = g_list_append(args, filterx_function_arg_new(FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS_NAME,
filterx_literal_new(filterx_boolean_new(TRUE))));
filterx_literal_new(filterx_boolean_new(FALSE))));

GError *err = NULL;
FilterXExpr *func = filterx_function_regexp_subst_new(filterx_function_args_new(args, NULL), &err);
Expand Down Expand Up @@ -350,6 +350,39 @@ Test(filterx_expr_regexp_subst, regexp_subst_group_subst_without_ref)
filterx_object_unref(result);
}

/* Multi-digit references (\10, \11, \12) must select the capture group with that number. */
Test(filterx_expr_regexp_subst, regexp_subst_group_reference_with_multiple_digits)
{
  FilterXFuncRegexpSubstOpts subst_opts = {.groups = TRUE};

  /* twelve two-digit groups, referenced in reverse order by the replacement */
  FilterXObject *subst_result = _sub(
                                  "(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})(\\d{2})",
                                  "\\12-\\11-\\10-\\9\\8\\7\\6\\5\\4\\3\\2\\1",
                                  "010203040506070809101112",
                                  subst_opts);
  cr_assert(filterx_object_is_type(subst_result, &FILTERX_TYPE_NAME(string)));

  const gchar *actual = filterx_string_get_value_ref(subst_result, NULL);
  cr_assert_str_eq(actual, "12-11-10-090807060504030201");

  filterx_object_unref(subst_result);
}

/* A reference to a group the pattern does not have (\20 with three groups) stays in the output verbatim. */
Test(filterx_expr_regexp_subst, regexp_subst_group_do_not_replace_unknown_ref)
{
  FilterXFuncRegexpSubstOpts subst_opts = {.groups = TRUE};

  FilterXObject *subst_result = _sub("(\\d{2})(\\d{2})(\\d{2})", "\\3\\20\\1", "010203", subst_opts);
  cr_assert(filterx_object_is_type(subst_result, &FILTERX_TYPE_NAME(string)));

  const gchar *actual = filterx_string_get_value_ref(subst_result, NULL);
  cr_assert_str_eq(actual, "03\\2001");

  filterx_object_unref(subst_result);
}

/*
 * Zero-prefixed references (\02, \001) resolve to groups 2 and 1, and digits
 * following a three-digit reference ("3.14" after \001) are kept as literal text.
 */
Test(filterx_expr_regexp_subst, regexp_subst_group_limited_digits_and_zero_prefixes)
{
  FilterXFuncRegexpSubstOpts subst_opts = {.groups = TRUE};

  FilterXObject *subst_result = _sub("(\\w+),(\\w+),(\\w+)", "\\3\\02\\0013.14", "baz,bar,foo", subst_opts);
  cr_assert(filterx_object_is_type(subst_result, &FILTERX_TYPE_NAME(string)));

  const gchar *actual = filterx_string_get_value_ref(subst_result, NULL);
  cr_assert_str_eq(actual, "foobarbaz3.14");

  filterx_object_unref(subst_result);
}

static void
setup(void)
{
Expand Down
12 changes: 8 additions & 4 deletions tests/light/functional_tests/filterx/test_filterx.py
Original file line number Diff line number Diff line change
Expand Up @@ -2004,9 +2004,11 @@ def test_regexp_subst(config, syslog_ng):
$MSG.orgrp_global = regexp_subst("foobarbaz", "(fo|az)", "!", global=true);
$MSG.ignore_case_control = regexp_subst("FoObArBaz", "(o|a)", "!", global=true);
$MSG.ignore_case = regexp_subst("FoObArBaz", "(o|a)", "!", ignorecase=true, global=true);
$MSG.groups_off = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "\\3-\\2-\\1");;
$MSG.groups_on = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "\\3-\\2-\\1", groups=true);
$MSG.mixed_grps = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "foo:\\3-\\2-\\1:bar:baz", groups=true);
$MSG.groups_off = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "\\3-\\2-\\1", groups=false);
$MSG.groups_on = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "\\3-\\2-\\1");
$MSG.mixed_grps = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "foo:\\3-\\2-\\1:bar:baz");
$MSG.multi_digit_grps = regexp_subst("010203040506070809101112", /(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})/, "\\10-\\11-\\12");
$MSG.prefixing_zeros = regexp_subst("foobar", /^(.*)$/, "\\001012345");
""",
)
syslog_ng.start(config)
Expand All @@ -2028,7 +2030,9 @@ def test_regexp_subst(config, syslog_ng):
r""""ignore_case":"F!!b!rB!z","""
r""""groups_off":"\\3-\\2-\\1","""
r""""groups_on":"2022-02-25","""
r""""mixed_grps":"foo:2022-02-25:bar:baz"}""" + "\n"
r""""mixed_grps":"foo:2022-02-25:bar:baz","""
r""""multi_digit_grps":"10-11-12","""
r""""prefixing_zeros":"foobar012345"}""" + "\n"
)
assert file_true.read_log() == exp

Expand Down
Loading