Merge pull request #36 from nixys/feat/35

feat(#35): Add regex capturing groups for column data types within the filters
nixys · Aug 8, 2024 · 08d2d5d · 08d2d5d
2 parents 1f9588e + c1432b9
commit 08d2d5d
Show file tree

Hide file tree

Showing 6 changed files with 118 additions and 86 deletions.
diff --git a/README.md b/README.md
@@ -13,7 +13,7 @@ nxs-data-anonymizer is a tool for anonymizing **PostgreSQL** and **MySQL/MariaDB
   - MySQL/MariaDB/Percona (5.7/8.0/8.1/all versions)
 - Flexible data faking based on:
   - Go templates and [Sprig template’s library](https://masterminds.github.io/sprig/) like [Helm](https://helm.sh/docs/chart_template_guide/functions_and_pipelines/). You may also use values of other columns for same row to build more flexible rules
-  - External commands you may execute to create table field value2
+  - External commands you may execute to create table field values
   - Security enforcement rules
   - Link cells across the database to generate the same values
 - Stream data processing. It means that you can use the tool through a pipe on the command line and redirect a dump from the source DB directly to the destination DB with the required transformations
@@ -304,19 +304,25 @@ Additional filter functions:
 - `isNull`: compare a field value with `NULL`
 
 You may also use the following data in templates:
+- Current table name. Statement: `{{ .TableName }}`
+- Current column name. Statement: `{{ .CurColumnName }}`
 - Values of other columns in the rules for same row (with values before substitutions). Statement: `{{ .Values.COLUMN_NAME }}` (e.g.: `{{ .Values.username }}`)
 - Global variables. Statement: `{{ .Variables.VARIABLE_NAME }}` (e.g.: `{{ .Variables.password }}`)  
+- Raw column data type. Statement: `{{ .ColumnTypeRaw }}`
+- Regex capturing groups for the column data type. This variable is an array of arrays, so you need to use `range` or `index` to access a specific element. Statement: `{{ index .ColumnTypeGroups 0 0 }}`. See [Types](#types-settings) for details
 
 **Command**
 
 To anonymize database fields you may use commands (scripts or binaries) with any logic you need. The command concept has the following properties:
 - The command's `stdout` will be used as a new value for the anonymized field
 - The command must return a zero exit code, otherwise nxs-data-anonymizer will fail with an error (in this case `stderr` will be used as the error text)
 - Environment variables with the row data are available within the command:
-  - `ENVVARTABLE`: contains a name of the filtered table
-  - `ENVVARCURCOLUMN`: contains the current column name 
-  - `ENVVARGLOBAL_{VARIABLE_NAME}`: contains value for specified global variable
+  - `ENVVARTABLE`: contains a name of the current table
+  - `ENVVARCURCOLUMN`: contains the current column name
   - `ENVVARCOLUMN_{COLUMN_NAME}`: contains values (before substitutions) for all columns for the current row
+  - `ENVVARGLOBAL_{VARIABLE_NAME}`: contains value for specified global variable
+  - `ENVVARCOLUMNTYPERAW`: contains raw column data type
+  - `ENVVARCOLUMNTYPEGROUP_{GROUP_NUM}_{SUBGROUP_NUM}`: contains the regex capturing groups for the column data type (the full match is available as `ENVVARCOLUMNTYPEGROUP_{GROUP_NUM}`). See [Types](#types-settings) for details
 
 ##### Security settings
 
@@ -410,7 +416,7 @@ _Values to masquerade a columns in accordance with the types see below._
 
 | Option         | Type   | Required | Default value | Description                                                      |
 |---             | :---:  | :---:    | :---:         |---                                                               |
-| `regex`      | String | Yes       | -      | Regular expression. Will be checked for match for column data type (in `CREATE TABLE` section) |
+| `regex`      | String | Yes       | -      | Regular expression. Will be checked for a match against the column data type (in `CREATE TABLE` section). You can use capturing groups within the regex; they are available as additional variable data in the filters (see [Columns](#columns-settings) for details). This helps to create more flexible rules that generate cell values in accordance with the data type features |
 | `rule`      | [Columns](#columns-settings) | Yes       | -      | Rule will be applied to columns whose data types match the specified regular expression |
 
 #### Example

diff --git a/modules/anonymizers/mysql/mysql.go b/modules/anonymizers/mysql/mysql.go
@@ -109,22 +109,16 @@ var typeKeys = map[string]columnType{
 
 func userCtxInit(s InitOpts) (*userCtx, error) {
 
-	trc := []relfilter.TypeRuleOpts{}
-	trd := []relfilter.TypeRuleOpts{}
-	if s.Security.ColumnsPolicy == misc.SecurityPolicyColumnsRandomize {
-		trc = s.Rules.TypeRuleCustom
-		trd = typeRuleDefault
-	}
-
 	f, err := relfilter.Init(
 		relfilter.InitOpts{
 			Variables:        s.Variables,
 			Link:             s.Link,
 			TableRules:       s.Rules.TableRules,
 			DefaultRules:     s.Rules.DefaultRules,
 			ExceptionColumns: s.Rules.ExceptionColumns,
-			TypeRuleCustom:   trc,
-			TypeRuleDefault:  trd,
+			TypeRuleCustom:   s.Rules.TypeRuleCustom,
+			TypeRuleDefault:  typeRuleDefault,
+			ColumnsPolicy:    s.Security.ColumnsPolicy,
 		},
 	)
 	if err != nil {

diff --git a/modules/anonymizers/mysql/security_types.go b/modules/anonymizers/mysql/security_types.go
@@ -15,7 +15,7 @@ var typeRuleDefault = []relfilter.TypeRuleOpts{
 
 	// String
 	{
-		Selector: "(?i)^char",
+		Selector: "(?i)^char\\((\\d+)\\)|^char ",
 		Rule: relfilter.ColumnRuleOpts{
 			Type:   misc.ValueTypeTemplate,
 			Value:  securityTypeString,

diff --git a/modules/anonymizers/pgsql/pgsql.go b/modules/anonymizers/pgsql/pgsql.go
@@ -53,22 +53,16 @@ type securityCtx struct {
 
 func userCtxInit(s InitOpts) (*userCtx, error) {
 
-	trc := []relfilter.TypeRuleOpts{}
-	trd := []relfilter.TypeRuleOpts{}
-	if s.Security.ColumnsPolicy == misc.SecurityPolicyColumnsRandomize {
-		trc = s.Rules.TypeRuleCustom
-		trd = typeRuleDefault
-	}
-
 	f, err := relfilter.Init(
 		relfilter.InitOpts{
 			Variables:        s.Variables,
 			Link:             s.Link,
 			TableRules:       s.Rules.TableRules,
 			DefaultRules:     s.Rules.DefaultRules,
 			ExceptionColumns: s.Rules.ExceptionColumns,
-			TypeRuleCustom:   trc,
-			TypeRuleDefault:  trd,
+			TypeRuleCustom:   s.Rules.TypeRuleCustom,
+			TypeRuleDefault:  typeRuleDefault,
+			ColumnsPolicy:    s.Security.ColumnsPolicy,
 		},
 	)
 	if err != nil {

diff --git a/modules/filters/relfilter/column.go b/modules/filters/relfilter/column.go
@@ -1,13 +1,22 @@
 package relfilter
 
+import "fmt"
+
 type columns struct {
 	cc []*column
 	m  map[string]*column
 }
 
 type column struct {
-	n       string
-	rawType string
+	n string
+	t columnTypes
+}
+
+type columnTypes struct {
+	raw    string
+	groups [][]string
+	r      *ColumnRuleOpts
+	env    []string
 }
 
 func columnsInit() columns {
@@ -17,11 +26,37 @@ func columnsInit() columns {
 	}
 }
 
-func (c *columns) add(name string, rt string) {
+func (c *columns) add(name string, rt string, pts [][]string, r *ColumnRuleOpts) {
+
+	env := []string{fmt.Sprintf("%s=%s", envVarColumnTypeRAW, rt)}
+
+	if pts != nil {
+		for i, g := range pts {
+			for j, sg := range g {
+
+				if j == 0 {
+					env = append(
+						env,
+						fmt.Sprintf("%s%d=%s", envVarColumnTypeGroupPrefix, i, sg),
+					)
+				} else {
+					env = append(
+						env,
+						fmt.Sprintf("%s%d_%d=%s", envVarColumnTypeGroupPrefix, i, j-1, sg),
+					)
+				}
+			}
+		}
+	}
 
 	v := column{
-		n:       name,
-		rawType: rt,
+		n: name,
+		t: columnTypes{
+			raw:    rt,
+			groups: pts,
+			r:      r,
+			env:    env,
+		},
 	}
 
 	c.cc = append(c.cc, &v)

diff --git a/modules/filters/relfilter/filter.go b/modules/filters/relfilter/filter.go
@@ -18,6 +18,8 @@ type InitOpts struct {
 	DefaultRules     map[string]ColumnRuleOpts
 	ExceptionColumns []string
 
+	ColumnsPolicy misc.SecurityPolicyColumnsType
+
 	TypeRuleCustom  []TypeRuleOpts
 	TypeRuleDefault []TypeRuleOpts
 }
@@ -63,6 +65,8 @@ type rules struct {
 	defaultRules     map[string]ColumnRuleOpts
 	exceptionColumns map[string]any
 
+	columnsPolicy misc.SecurityPolicyColumnsType
+
 	typeRuleCustom  []typeRule
 	typeRuleDefault []typeRule
 
@@ -108,10 +112,12 @@ type execFilterOpts struct {
 const uniqueAttempts = 5
 
 const (
-	envVarGlobalPrefix = "ENVVARGLOBAL_"
-	envVarTable        = "ENVVARTABLE"
-	envVarColumnPrefix = "ENVVARCOLUMN_"
-	envVarCurColumn    = "ENVVARCURCOLUMN"
+	envVarGlobalPrefix          = "ENVVARGLOBAL_"
+	envVarTable                 = "ENVVARTABLE"
+	envVarColumnPrefix          = "ENVVARCOLUMN_"
+	envVarCurColumn             = "ENVVARCURCOLUMN"
+	envVarColumnTypeRAW         = "ENVVARCOLUMNTYPERAW"
+	envVarColumnTypeGroupPrefix = "ENVVARCOLUMNTYPEGROUP_"
 )
 
 type applyRule struct {
@@ -219,6 +225,7 @@ func Init(opts InitOpts) (*Filter, error) {
 			exceptionColumns: excpts,
 			typeRuleCustom:   trc,
 			typeRuleDefault:  trd,
+			columnsPolicy:    opts.ColumnsPolicy,
 		},
 	}, nil
 }
@@ -247,7 +254,26 @@ func (filter *Filter) TableRulesLookup(name string) map[string]ColumnRuleOpts {
 
 // ColumnAdd adds new column into current data set
 func (filter *Filter) ColumnAdd(name string, rt string) {
-	filter.tableData.columns.add(name, rt)
+
+	//var rl *ColumnRuleOpts
+
+	for _, r := range filter.rules.typeRuleCustom {
+		gd := r.Rgx.FindAllStringSubmatch(rt, -1)
+		if len(gd) > 0 {
+			filter.tableData.columns.add(name, rt, gd, &r.Rule)
+			return
+		}
+	}
+
+	for _, r := range filter.rules.typeRuleDefault {
+		gd := r.Rgx.FindAllStringSubmatch(rt, -1)
+		if len(gd) > 0 {
+			filter.tableData.columns.add(name, rt, gd, &r.Rule)
+			return
+		}
+	}
+
+	filter.tableData.columns.add(name, rt, nil, nil)
 }
 
 func (filter *Filter) ColumnGetName(index int) string {
@@ -346,47 +372,21 @@ func (filter *Filter) Apply() error {
 			continue
 		}
 
-		// Check custom type rule for column
-		if b := func() bool {
-			for _, r := range filter.rules.typeRuleCustom {
-				if r.Rgx.Match([]byte(c.rawType)) {
-					rls = append(
-						rls,
-						applyRule{
-							c:  c,
-							i:  i,
-							cr: r.Rule,
-						},
-					)
-					return true
-				}
-			}
-			return false
-		}(); b {
-			continue
-		}
+		// Other rules if required
 
-		// Check default type rule for column
-		if b := func() bool {
-			for _, r := range filter.rules.typeRuleDefault {
-				if r.Rgx.Match([]byte(c.rawType)) {
-					rls = append(
-						rls,
-						applyRule{
-							c:  c,
-							i:  i,
-							cr: r.Rule,
-						},
-					)
-					return true
-				}
+		// Default rules for types
+		if filter.rules.columnsPolicy == misc.SecurityPolicyColumnsRandomize {
+			if c.t.r != nil {
+				rls = append(
+					rls,
+					applyRule{
+						c:  c,
+						i:  i,
+						cr: *c.t.r,
+					},
+				)
 			}
-			return false
-		}(); b {
-			continue
 		}
-
-		// Other rules if required
 	}
 
 	// Apply rules
@@ -454,28 +454,31 @@ func (filter *Filter) applyRules(tname string, rls []applyRule) error {
 		} else {
 
 			type tplData struct {
-				TableName string
-				Values    map[string]string
-				Variables map[string]string
+				TableName        string
+				CurColumnName    string
+				Values           map[string]string
+				Variables        map[string]string
+				ColumnTypeRaw    string
+				ColumnTypeGroups [][]string
 			}
 
 			td := tplData{
-				TableName: tname,
-				Values:    valOld,
-				Variables: filter.rules.variables,
+				TableName:        tname,
+				CurColumnName:    r.c.n,
+				Values:           valOld,
+				Variables:        filter.rules.variables,
+				ColumnTypeRaw:    r.c.t.raw,
+				ColumnTypeGroups: r.c.t.groups,
 			}
 
-			tdenv := []string{
+			tde := []string{
 				fmt.Sprintf("%s=%s", envVarTable, tname),
-			}
-
-			// Create tmp env variables with current column name
-			tde := append(
-				tdenv,
 				fmt.Sprintf("%s=%s", envVarCurColumn, r.c.n),
-			)
+			}
 
-			tdenv = append(tdenv, valEnvGlob...)
+			tde = append(tde, valEnvOld...)
+			tde = append(tde, valEnvGlob...)
+			tde = append(tde, r.c.t.env...)
 
 			v, err = filter.applyColumnFilter(r.c.n, r.cr, td, tde)
 			if err != nil {