Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Filterx regexp_subst supports match groups #394

Merged
merged 4 commits into from
Dec 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 58 additions & 3 deletions lib/filterx/expr-regexp.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@
FILTERX_FUNC_REGEXP_SUBST_FLAG_GLOBAL_NAME"=(boolean) " \
FILTERX_FUNC_REGEXP_SUBST_FLAG_UTF8_NAME"=(boolean) " \
FILTERX_FUNC_REGEXP_SUBST_FLAG_IGNORECASE_NAME"=(boolean) " \
FILTERX_FUNC_REGEXP_SUBST_FLAG_NEWLINE_NAME"=(boolean))" \
FILTERX_FUNC_REGEXP_SUBST_FLAG_NEWLINE_NAME"=(boolean)" \
FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS_NAME"=(boolean))" \

#define FILTERX_FUNC_REGEXP_SEARCH_USAGE "Usage: regexp_search(string, pattern)"

Expand All @@ -48,6 +49,7 @@ typedef struct FilterXReMatchState_
FilterXObject *lhs_obj;
const gchar *lhs_str;
gsize lhs_str_len;
gint rc;
} FilterXReMatchState;

static void
Expand Down Expand Up @@ -112,6 +114,7 @@ _match_inner(FilterXReMatchState *state, pcre2_code_8 *pattern, gint start_offse
gint rc = pcre2_match(pattern, (PCRE2_SPTR) state->lhs_str, (PCRE2_SIZE) state->lhs_str_len, (PCRE2_SIZE) start_offset,
0,
state->match_data, NULL);
state->rc = rc;
if (rc < 0)
{
switch (rc)
Expand Down Expand Up @@ -543,18 +546,67 @@ _is_zero_length_match(PCRE2_SIZE *ovector)
return ovector[0] == ovector[1];
}

/*
 * Expand numbered back-references in self->replacement using the capture
 * groups of the match currently stored in state->match_data.
 *
 * A backslash followed by a digit 1..9 is replaced with the text of that
 * capture group, taken from state->lhs_str.  A reference is left in the
 * output verbatim when the group index is not below state->rc (the value
 * pcre2_match() returned for this match) or when the group is unset.
 * A backslash followed by any other character is copied through unchanged
 * together with that character.
 *
 * replacement_string is truncated first and receives the expanded text.
 * Always returns TRUE.
 */
static gboolean
_build_replacement_stirng_with_match_groups(const FilterXFuncRegexpSubst *self, FilterXReMatchState *state,
                                            GString *replacement_string)
{
  PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(state->match_data);
  const gchar *rep_ptr = self->replacement;
  /* start of the literal run that has not been copied to the output yet */
  const gchar *last_ptr = rep_ptr;
  gint num_grps = state->rc;

  g_string_set_size(replacement_string, 0);

  while (*rep_ptr)
    {
      if (*rep_ptr != '\\')
        {
          rep_ptr++;
          continue;
        }

      /* step onto the character that follows the backslash */
      rep_ptr++;

      /*
       * A lone backslash at the very end of the replacement: stop on the
       * terminator instead of stepping over it, otherwise the loop condition
       * and the final append would read past the end of the string.  The
       * backslash itself is emitted literally by the trailing append below.
       */
      if (*rep_ptr == '\0')
        break;

      if (*rep_ptr >= '1' && *rep_ptr <= '9')
        {
          gint grp_idx = *rep_ptr - '0';
          if (grp_idx < num_grps)
            {
              PCRE2_SIZE start = ovector[2 * grp_idx];
              PCRE2_SIZE end = ovector[2 * grp_idx + 1];
              if (start != PCRE2_UNSET)
                {
                  /* flush the literal text preceding the backslash ... */
                  g_string_append_len(replacement_string, last_ptr, rep_ptr - last_ptr - 1);
                  last_ptr = rep_ptr + 1;
                  /* ... then the captured group in place of the reference */
                  g_string_append_len(replacement_string, state->lhs_str + start, end - start);
                }
            }
        }

      /* consume the character after the backslash, whatever it was */
      rep_ptr++;
    }

  /* copy whatever literal text remains after the last expanded reference */
  g_string_append_len(replacement_string, last_ptr, rep_ptr - last_ptr);
  return TRUE;
}

static FilterXObject *
_replace_matches(const FilterXFuncRegexpSubst *self, FilterXReMatchState *state)
{
GString *new_value = scratch_buffers_alloc();
PCRE2_SIZE *ovector = NULL;
gint pos = 0;
const gchar *replacement_string = self->replacement;

if (self->opts.groups)
{
GString *rep_str = scratch_buffers_alloc();
_build_replacement_stirng_with_match_groups(self, state, rep_str);
replacement_string = rep_str->str;
}

do
{
ovector = pcre2_get_ovector_pointer(state->match_data);

g_string_append_len(new_value, state->lhs_str + pos, _start_offset(ovector) - pos);
g_string_append(new_value, self->replacement);
g_string_append(new_value, replacement_string);

if (_is_zero_length_match(ovector))
{
Expand All @@ -574,7 +626,7 @@ _replace_matches(const FilterXFuncRegexpSubst *self, FilterXReMatchState *state)

// handle the very last of the zero-length matches
if (_is_zero_length_match(ovector))
g_string_append(new_value, self->replacement);
g_string_append(new_value, replacement_string);

return filterx_string_new(new_value->str, new_value->len);
}
Expand Down Expand Up @@ -689,6 +741,9 @@ _extract_optional_flags(FilterXFuncRegexpSubst *self, FilterXFunctionArgs *args,
if (!_extract_literal_bool(args, FILTERX_FUNC_REGEXP_SUBST_FLAG_UTF8_NAME, &self->opts.utf8,
error))
return FALSE;
if (!_extract_literal_bool(args, FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS_NAME,
&self->opts.groups, error))
return FALSE;
return TRUE;
}

Expand Down
2 changes: 2 additions & 0 deletions lib/filterx/expr-regexp.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#define FILTERX_FUNC_REGEXP_SUBST_FLAG_UTF8_NAME "utf8"
#define FILTERX_FUNC_REGEXP_SUBST_FLAG_IGNORECASE_NAME "ignorecase"
#define FILTERX_FUNC_REGEXP_SUBST_FLAG_NEWLINE_NAME "newline"
#define FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS_NAME "groups"

typedef struct FilterXFuncRegexpSubstOpts_
{
Expand All @@ -41,6 +42,7 @@ typedef struct FilterXFuncRegexpSubstOpts_
gboolean utf8;
gboolean ignorecase;
gboolean newline;
gboolean groups;
} FilterXFuncRegexpSubstOpts;

FilterXExpr *filterx_expr_regexp_match_new(FilterXExpr *lhs, const gchar *pattern);
Expand Down
30 changes: 30 additions & 0 deletions lib/filterx/tests/test_expr_regexp.c
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,9 @@ _build_subst_func(const gchar *pattern, const gchar *repr, const gchar *str, Fil
if (opts.utf8)
args = g_list_append(args, filterx_function_arg_new(FILTERX_FUNC_REGEXP_SUBST_FLAG_UTF8_NAME,
filterx_literal_new(filterx_boolean_new(TRUE))));
if (opts.groups)
args = g_list_append(args, filterx_function_arg_new(FILTERX_FUNC_REGEXP_SUBST_FLAG_GROUPS_NAME,
filterx_literal_new(filterx_boolean_new(TRUE))));

GError *err = NULL;
FilterXExpr *func = filterx_function_regexp_subst_new(filterx_function_args_new(args, NULL), &err);
Expand Down Expand Up @@ -527,6 +530,33 @@ Test(filterx_expr_regexp, regexp_subst_match_opt_ignorecase_nojit)
filterx_object_unref(result_alt);
}

/*
 * regexp_subst() with numbered group references in the replacement:
 * with the default options the references are kept verbatim, with
 * groups=TRUE they are expanded from the matched capture groups.
 */
Test(filterx_expr_regexp, regexp_subst_group_subst)
{
  /* groups flag left at its default: "\N" stays literal text */
  FilterXFuncRegexpSubstOpts default_opts = {};
  FilterXObject *literal_obj = _sub("(\\d{2})-(\\d{2})-(\\d{4})", "\\3-\\2-\\1", "25-02-2022", default_opts);
  cr_assert(filterx_object_is_type(literal_obj, &FILTERX_TYPE_NAME(string)));
  const gchar *literal_str = filterx_string_get_value_ref(literal_obj, NULL);
  cr_assert_str_eq(literal_str, "\\3-\\2-\\1");
  filterx_object_unref(literal_obj);

  /* groups flag enabled: day-month-year gets reordered to year-month-day */
  FilterXFuncRegexpSubstOpts groups_opts = {.groups = TRUE};
  FilterXObject *expanded_obj = _sub("(\\d{2})-(\\d{2})-(\\d{4})", "\\3-\\2-\\1", "25-02-2022", groups_opts);
  cr_assert(filterx_object_is_type(expanded_obj, &FILTERX_TYPE_NAME(string)));
  const gchar *expanded_str = filterx_string_get_value_ref(expanded_obj, NULL);
  cr_assert_str_eq(expanded_str, "2022-02-25");
  filterx_object_unref(expanded_obj);
}

/*
 * With groups=TRUE but no "\N" reference in the replacement, the
 * replacement text must come through unchanged.
 */
Test(filterx_expr_regexp, regexp_subst_group_subst_without_ref)
{
  FilterXFuncRegexpSubstOpts groups_opts = {.groups = TRUE};

  FilterXObject *subst_obj = _sub("(\\d{2})-(\\d{2})-(\\d{4})", "group without ref", "25-02-2022", groups_opts);
  cr_assert(filterx_object_is_type(subst_obj, &FILTERX_TYPE_NAME(string)));

  const gchar *subst_str = filterx_string_get_value_ref(subst_obj, NULL);
  cr_assert_str_eq(subst_str, "group without ref");

  filterx_object_unref(subst_obj);
}

static void
setup(void)
{
Expand Down
8 changes: 7 additions & 1 deletion tests/light/functional_tests/filterx/test_filterx.py
Original file line number Diff line number Diff line change
Expand Up @@ -1938,6 +1938,9 @@ def test_regexp_subst(config, syslog_ng):
$MSG.orgrp_global = regexp_subst("foobarbaz", "(fo|az)", "!", global=true);
$MSG.ignore_case_control = regexp_subst("FoObArBaz", "(o|a)", "!", global=true);
$MSG.ignore_case = regexp_subst("FoObArBaz", "(o|a)", "!", ignorecase=true, global=true);
$MSG.groups_off = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "\\3-\\2-\\1");;
$MSG.groups_on = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "\\3-\\2-\\1", groups=true);
$MSG.mixed_grps = regexp_subst("25-02-2022", /(\d{2})-(\d{2})-(\d{4})/, "foo:\\3-\\2-\\1:bar:baz", groups=true);
""",
)
syslog_ng.start(config)
Expand All @@ -1956,7 +1959,10 @@ def test_regexp_subst(config, syslog_ng):
r""""zero_length_match_global":"!f!o!o!b!a!r!b!a!z!","""
r""""orgrp_global":"!obarb!","""
r""""ignore_case_control":"F!ObArB!z","""
r""""ignore_case":"F!!b!rB!z"}""" + "\n"
r""""ignore_case":"F!!b!rB!z","""
r""""groups_off":"\\3-\\2-\\1","""
r""""groups_on":"2022-02-25","""
r""""mixed_grps":"foo:2022-02-25:bar:baz"}""" + "\n"
)
assert file_true.read_log() == exp

Expand Down