-
Notifications
You must be signed in to change notification settings - Fork 0
/
full.yml
649 lines (572 loc) · 38.7 KB
/
full.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
#
# JBZoo Toolbox - Csv-Blueprint.
#
# This file is part of the JBZoo Toolbox project.
# For the full copyright and license information, please view the LICENSE
# file that was distributed with this source code.
#
# @license MIT
# @copyright Copyright (C) JBZoo.com, All rights reserved.
# @see https://github.com/JBZoo/Csv-Blueprint
#
# It's a complete example of the CSV schema file in YAML format.
# See copy of the file without comments here ./schema-examples/full_clean.yml
# Just meta
name: CSV Blueprint Schema Example # Name of a CSV file. Not used in the validation process.
description: |- # Any description of the CSV file. Not used in the validation process.
This YAML file provides a detailed description and validation rules for CSV files
to be processed by CSV Blueprint tool. It includes specifications for file name patterns,
CSV formatting options, and extensive validation criteria for individual columns and their values,
supporting a wide range of data validation rules from basic type checks to complex regex validations.
This example serves as a comprehensive guide for creating robust CSV file validations.
# Include another schema and define an alias for it.
presets:
my-preset: ./preset_users.yml # Define preset alias "my-preset". See README.md for details.
# Regular expression to match the file name. If not set, then no pattern check.
# This allows you to pre-validate the file name before processing its contents.
# Feel free to check parent directories as well.
# See: https://www.php.net/manual/en/reference.pcre.pattern.syntax.php
filename_pattern: /\.csv$/i
# preset: my-preset # See README.md for details.
# Here are default values to parse CSV file.
# You can skip this section if you don't need to override the default values.
csv:
preset: my-preset # See README.md for details.
header: true # If the first row is a header. If true, name of each column is required.
delimiter: , # Delimiter character in CSV file.
quote_char: \ # Quote character in CSV file.
enclosure: '"' # Enclosure for each field in CSV file.
encoding: utf-8 # (Experimental) Only utf-8, utf-16, utf-32.
bom: false # (Experimental) If the file has a BOM (Byte Order Mark) at the beginning.
# Structural rules for the CSV file. These rules are applied to the entire CSV file.
# They are not(!) related to the data in the columns.
# You can skip this section if you don't need to override the default values.
structural_rules: # Here are default values.
preset: my-preset # See README.md for details.
strict_column_order: true # Ensure columns in CSV follow the same order as defined in this YML schema. It works only if "csv.header" is true.
allow_extra_columns: false # Allow CSV files to have more columns than specified in this YML schema.
# Add any extra data you want. It will be ignored by the tool but available for your own code.
# You can use any format and store anything. Examples:
# extra: 'some text'
# extra: [some, options, here]
# extra: 42
extra:
- key: "value"
# Description of each column in CSV.
# It is recommended to present each column in the same order as presented in the CSV file.
# This will not affect the validator, but will make it easier for you to navigate.
# For convenience, use the first line as a header (if possible).
columns:
- preset: my-preset/login # Add preset rules for the column. See README.md for details.
name: Column Name (header) # Any custom name of the column in the CSV file (first row). Required if "csv.header" is true.
description: Lorem ipsum # Description of the column. Not used in the validation process.
example: Some example # Example of the column value. Schema will also check this value on its own.
# If the column is required. If true, the column must be present in the CSV file. If false, the column can be missing in the CSV file.
# So, if you want to make the column optional, set this value to false, and it will validate the column only if it is present.
# By default, the column is required. It works only if "csv.header" is true and "structural_rules.allow_extra_columns" is false.
required: true
# Add any extra data you want. It will be ignored by the tool but available for your own code.
# You can use any format and store anything. Examples:
# extra: 'some text'
# extra: [some, options, here]
# extra: 42
extra:
- key: "value"
####################################################################################################################
# Data validation for each(!) value in the column. Please, see notes in README.md
# Every rule is optional.
rules:
preset: my-preset/login # Add preset rules for the column. See README.md for details.
# General rules
not_empty: true # Value is not an empty string. Actually checks if the string length is not 0.
exact_value: Some string # Exact value for string in the column.
allow_values: [ y, n, "" ] # Strict set of values that are allowed.
not_allow_values: [ invalid ] # Strict set of values that are NOT allowed.
# Any valid regex pattern. See: https://www.php.net/manual/en/reference.pcre.pattern.syntax.php
# Of course it's a super powerful tool to verify any sort of string data.
# Please, be careful. Regex is a powerful tool, but it can be very dangerous if used incorrectly.
# Remember that if you want to solve a problem with regex, you now have two problems.
# But have it your way, then happy debugging! https://regex101.com
regex: /^[\d]{2}$/
# Checks length of a string including spaces (multibyte safe).
length_min: 1 # x >= 1
length_greater: 2 # x > 2
length_not: 0 # x != 0
length: 7 # x == 7
length_less: 8 # x < 8
length_max: 9 # x <= 9
# Basic string checks
is_trimmed: true # Only trimmed strings. Example: "Hello World" (not " Hello World ").
is_lowercase: true # String is only lower-case. Example: "hello world".
is_uppercase: true # String is only upper-case. Example: "HELLO WORLD".
is_capitalize: true # String is only capitalized. Example: "Hello World".
is_sentence: true # Sentence with at least one space. Example: "Hello world!".
# Count number of words used in a string
# Note that multibyte locales are not supported.
# Example: "Hello World, 123" - 2 words only (123 is not a word).
word_count_min: 1 # x >= 1
word_count_greater: 2 # x > 2
word_count_not: 0 # x != 0
word_count: 7 # x == 7
word_count_less: 8 # x < 8
word_count_max: 9 # x <= 9
# Contains rules
contains: World # Example: "Hello World!". The string must contain "World" in any place.
contains_none: [ a, b ] # All the strings must NOT be part of a CSV value.
contains_one: [ a, b ] # Only one of the strings must be part of the CSV value.
contains_any: [ a, b ] # At least one of the string must be part of the CSV value.
contains_all: [ a, b ] # All the strings must be part of a CSV value.
starts_with: 'prefix ' # Example: "prefix Hello World".
ends_with: ' suffix' # Example: "Hello World suffix".
# Numeric
is_int: true # Check format only. Can be negative and positive. Without any separators.
is_float: true # Check format only. Can be negative and positive. Dot as decimal separator.
# Under the hood it converts and compares as float values.
# Comparison accuracy is 10 digits after a dot.
# Scientific number format is also supported. Example: "1.2e3"
num_min: 1.0 # x >= 1.0
num_greater: 2.0 # x > 2.0
num_not: 5.0 # x != 5.0
num: 7.0 # x == 7.0
num_less: 8.0 # x < 8.0
num_max: 9.0 # x <= 9.0
# Number of digits after the decimal point (with zeros)
precision_min: 1 # x >= 1
precision_greater: 2 # x > 2
precision_not: 0 # x != 0
precision: 7 # x == 7
precision_less: 8 # x < 8
precision_max: 9 # x <= 9
# Date & time
is_date: true # Accepts arbitrary date format. Is shows error if failed to convert to timestamp.
is_timezone: true # Allow only timezone identifiers. Case-insensitive. Example: "Europe/London", "utc".
is_timezone_offset: true # Allow only timezone offsets. Example: "+03:00".
is_time: true # Check if the cell value is a valid time in the format "HH:MM:SS AM/PM" / "HH:MM:SS" / "HH:MM". Case-insensitive.
is_leap_year: true # Check if the cell value is a leap year. Example: "2008", "2008-02-29 23:59:59 UTC".
# Dates. Under the hood, the strings are converted to timestamp and compared.
# This gives you the ability to use relative dates and any formatting you want.
# By default, it works in UTC. But you can specify your own timezone as part of the date string.
# Format: https://www.php.net/manual/en/datetime.format.php
# Parsing: https://www.php.net/manual/en/function.strtotime.php
# Timezones: https://www.php.net/manual/en/timezones.php
date_min: -100 years # Example of relative past date
date_greater: -99 days # Example of relative formats
date_not: 2006-01-02 15:04:05 -0700 Europe/Rome
date: 01 Jan 2000 # You can use any string that can be parsed by the strtotime function
date_less: now # Example of current date and time
date_max: +1 day # Example of relative future date
date_format: Y-m-d # Check strict format of the date.
# Date Intervals. Under the hood, the strings are converted to seconds and compared.
# See: https://www.php.net/manual/en/class.dateinterval.php
# See: https://www.php.net/manual/en/dateinterval.createfromdatestring.php
date_interval_min: PT0S # 0 seconds
date_interval_greater: 1day 1sec # 1 day and 1 second
date_interval_not: 100 days # Except for the 100 days
date_interval: P2W # Exactly 2 weeks
date_interval_less: PT23H59M59S # 23 hours, 59 minutes, and 59 seconds
date_interval_max: P1Y # 1 year
# Check an arbitrary date in a CSV cell for age (years).
# Actually it calculates the difference between the date and the current date.
# Convenient to use for age restrictions based on birthday.
# See the description of `date_*` functions for details on date formats.
date_age_min: 1 # x >= 1
date_age_greater: 14 # x > 14
date_age_not: 18 # x != 18
date_age: 21 # x == 21
date_age_less: 99 # x < 99
date_age_max: 100 # x <= 100
# Specific formats
is_bool: true # Allow only boolean values "true" and "false", case-insensitive.
is_binary: true # Both: with or without "0b" prefix. Example: "0b10" or "10".
is_octal: true # Validates octal numbers in the format "0o123".
is_hex: true # Both: with or without "0x" prefix. Example: "0x1A".
is_uuid: true # Validates whether the input is a valid UUID. It also supports validation of specific versions 1, 3, 4 and 5.
is_slug: true # Only slug format. Example: "my-slug-123". It can contain letters, numbers, and dashes.
is_currency_code: true # Validates an ISO 4217 currency code like GBP or EUR. Case-sensitive. See: https://en.wikipedia.org/wiki/ISO_4217.
is_base64: true # Validate if a string is Base64-encoded. Example: "cmVzcGVjdCE=".
is_angle: true # Check if the cell value is a valid angle (0.0 to 360.0).
# Safity checks
# Password strength calculation criteria include: Length (max 5 points, +1 every 2 characters),
# presence of uppercase letters (+1), lowercase letters (+1), numbers (+1), special characters (+1),
# spaces (+1), and penalties for consecutive sequences of uppercase, lowercase, or
# numbers (-0.5 each), repetitive sequences (-0.75 each), common weak passwords like "qwerty",
# and passwords under 6 characters (-2). Adjust scores to a 0 to 10 scale, with a minimum score of 0.
password_strength_min: 1 # x >= 1
password_strength_greater: 2 # x > 2
password_strength_not: 0 # x != 0
password_strength: 7 # x == 7
password_strength_less: 8 # x < 8
password_strength_max: 9 # x <= 9
is_password_safe_chars: true # Check that the cell value contains only safe characters for regular passwords. Allowed characters: a-z, A-Z, 0-9, !@#$%^&*()_+-=[]{};:'"|,.<>/?~.
# Internet
is_ip: true # Both: IPv4 or IPv6.
is_ip_v4: true # Only IPv4. Example: "127.0.0.1".
is_ip_v6: true # Only IPv6. Example: "2001:0db8:85a3:08d3:1319:8a2e:0370:7334".
is_ip_private: true # IPv4 has ranges: 10.0.0.0/8, 172.16.0.0/12 and 192.168.0.0/16. IPv6 has ranges starting with FD or FC.
is_ip_reserved: true # IPv4 has ranges: 0.0.0.0/8, 169.254.0.0/16, 127.0.0.0/8 and 240.0.0.0/4. IPv6 has ranges: ::1/128, ::/128, ::ffff:0:0/96 and fe80::/10.
ip_v4_range: [ '127.0.0.1-127.0.0.5', '127.0.0.0/21' ] # Check subnet mask or range for IPv4. Address must be in one of the ranges.
is_mac_address: true # The input is a valid MAC address. Example: 00:00:5e:00:53:01
is_domain: true # Only domain name. Example: "example.com".
is_public_domain_suffix: true # The input is a public ICANN domain suffix. Example: "com", "nom.br", "net" etc.
is_url: true # Only URL format. Example: "https://example.com/page?query=string#anchor".
is_email: true # Only email format. Example: "user@example.com".
# Validates if the given input is a valid JSON.
# This is possible if you escape all special characters correctly and use a special CSV format.
is_json: true # Example: {"foo":"bar"}.
# Geography
is_latitude: true # Can be integer or float. Example: 50.123456.
is_longitude: true # Can be integer or float. Example: -89.123456.
is_geohash: true # Check if the value is a valid geohash. Example: "u4pruydqqvj".
is_cardinal_direction: true # Valid cardinal direction. Case-insensitive. Available values: ["N", "S", "E", "W", "NE", "SE", "NW", "SW", "NONE"]
is_usa_market_name: true # Check if the value is a valid USA market name. Example: "New York, NY".
is_usa_state: true # Name or code of USA state name. Case-insensitive. Example: "CA" or "California".
# Validates whether the input is a country code in ISO 3166-1 standard.
# Available options: "alpha-2" (Ex: "US"), "alpha-3" (Ex: "USA"), "numeric" (Ex: "840").
# The rule uses data from iso-codes: https://salsa.debian.org/iso-codes-team/iso-codes.
country_code: alpha-2 # Country code in ISO 3166-1 standard. Examples: "US", "USA", "840"
# Validates whether the input is language code based on ISO 639.
# Available options: "alpha-2" (Ex: "en"), "alpha-3" (Ex: "eng").
# See: https://en.wikipedia.org/wiki/ISO_639.
language_code: alpha-2 # Examples: "en", "eng"
# Filesystem (with IO!)
is_file_exists: true # Check if file exists on the filesystem (It's FS IO operation!).
is_dir_exists: true # Check if directory exists on the filesystem (It's FS IO operation!).
# Mathematical
is_fibonacci: true # Validates whether the input follows the Fibonacci integer sequence. Example: "8", "13".
is_prime_number: true # Validates a prime number. Example: "3", "5", "7", "11".
is_even: true # Check if the value is an even number. Example: "2", "4", "6".
is_odd: true # Check if the value is an odd number. Example: "1", "7", "11".
is_roman: true # Validates if the input is a Roman numeral. Example: "I", "IV", "XX".
is_luhn: true # Luhn algorithm. See: https://en.wikipedia.org/wiki/Luhn_algorithm
# Identifications
phone: ALL # Validates if the input is a phone number. Specify the country code to validate the phone number for a specific country. Example: "ALL", "US", "BR".".
postal_code: US # Validate postal code by country code (alpha-2). Example: "02179". Extracted from https://www.geonames.org
is_iban: true # IBAN - International Bank Account Number. See: https://en.wikipedia.org/wiki/International_Bank_Account_Number
is_bic: true # Validates a Bank Identifier Code (BIC) according to ISO 9362 standards. See: https://en.wikipedia.org/wiki/ISO_9362
is_imei: true # Validates an International Mobile Equipment Identity (IMEI). See: https://en.wikipedia.org/wiki/International_Mobile_Station_Equipment_Identity
is_isbn: true # Validates an International Standard Book Number (ISBN). See: https://www.isbn-international.org/content/what-isbn
# Misc
is_version: true # Validates the string as version numbers using Semantic Versioning. Example: "1.2.3".
is_punct: true # Validates whether the input composed by only punctuation characters. Example: "!@#$%^&*()".
is_vowel: true # Validates whether the input contains only vowels. Example: "aei".
is_consonant: true # Validates if the input contains only consonants. Example: "bcd".
is_alnum: true # Validates whether the input is only alphanumeric. Example: "aBc123".
is_alpha: true # This is similar to `is_alnum`, but it does not allow numbers. Example: "aBc".
is_hex_rgb_color: true # Validates weather the input is a hex RGB color or not. Examples: "#FF0000", "#123", "ffffff", "fff".
# Check if the value is a valid hash. Supported algorithms:
# - md5, md4, md2, sha1, sha224, sha256, sha384, sha512/224, sha512/256, sha512
# - sha3-224, sha3-256, sha3-384, sha3-512, ripemd128, ripemd160, ripemd256, ripemd320, whirlpool, tiger128,3
# - tiger160,3, tiger192,3, tiger128,4, tiger160,4, tiger192,4, snefru, snefru256, gost, gost-crypto, crc32
# - crc32b, crc32c, adler32, fnv132, fnv1a32, fnv164, fnv1a64, joaat, murmur3a, murmur3c
# - murmur3f, xxh32, xxh64, xxh3, xxh128, haval128,3, haval160,3, haval192,3, haval224,3, haval256,3
# - haval128,4, haval160,4, haval192,4, haval224,4, haval256,4, haval128,5, haval160,5, haval192,5, haval224,5, haval256,5
hash: set_algo # Example: "1234567890abcdef".
# Check if a string is in a specific charset. Available charsets:
# - 7bit, 8bit, ASCII, ArmSCII-8, BASE64, BIG-5, CP850, CP866, CP932, CP936
# - CP950, CP50220, CP50221, CP50222, CP51932, EUC-CN, EUC-JP, EUC-JP-2004, EUC-KR, EUC-TW
# - GB18030, GB18030-2022, HTML-ENTITIES, HZ, ISO-2022-JP, ISO-2022-JP-2004, ISO-2022-JP-MOBILE#KDDI, ISO-2022-JP-MS, ISO-2022-KR, ISO-8859-1
# - ISO-8859-2, ISO-8859-3, ISO-8859-4, ISO-8859-5, ISO-8859-6, ISO-8859-7, ISO-8859-8, ISO-8859-9, ISO-8859-10, ISO-8859-13
# - ISO-8859-14, ISO-8859-15, ISO-8859-16, JIS, KOI8-R, KOI8-U, Quoted-Printable, SJIS, SJIS-2004, SJIS-Mobile#DOCOMO
# - SJIS-Mobile#KDDI, SJIS-Mobile#SOFTBANK, SJIS-mac, SJIS-win, UCS-2, UCS-2BE, UCS-2LE, UCS-4, UCS-4BE, UCS-4LE
# - UHC, UTF-7, UTF-8, UTF-8-Mobile#DOCOMO, UTF-8-Mobile#KDDI-A, UTF-8-Mobile#KDDI-B, UTF-8-Mobile#SOFTBANK, UTF-16, UTF-16BE, UTF-16LE
# - UTF-32, UTF-32BE, UTF-32LE, UTF7-IMAP, UUENCODE, Windows-1251, Windows-1252, Windows-1254, eucJP-win
charset: charset_code # Validates if a string is in a specific charset. Example: "UTF-8".
# Validates whether the input is a credit card number.
# Available credit card brands: "Any", "American Express", "Diners Club", "Discover", "JCB", "MasterCard", "Visa", "RuPay".
credit_card: Any # Example: "5376-7473-9720-8720"
####################################################################################################################
# Data validation for the entire(!) column using different data aggregation methods.
# Every rule is optional.
aggregate_rules:
preset: my-preset/login # Add preset aggregate rules for the column. See README.md for details.
is_unique: true # All values in the column are unique.
# Check if the column is sorted in a specific order.
# - Direction: ["asc", "desc"].
# - Method: ["numeric", "string", "natural", "regular"].
# See: https://www.php.net/manual/en/function.sort.php
sorted: [ asc, natural ] # Expected ascending order, natural sorting.
# First number in the column. Expected value is float or integer.
first_num_min: 1.0 # x >= 1.0
first_num_greater: 2.0 # x > 2.0
first_num_not: 5.0 # x != 5.0
first_num: 7.0 # x == 7.0
first_num_less: 8.0 # x < 8.0
first_num_max: 9.0 # x <= 9.0
first: Expected # First value in the column. Will be compared as strings.
first_not: Not expected # Not allowed as the first value in the column. Will be compared as strings.
# N-th value in the column.
# The rule expects exactly two arguments: the first is the line number (without header), the second is the expected value.
# Example: `[ 42, 5.0 ]` On the line 42 (disregarding the header), we expect the 5.0. The comparison is always as float.
nth_num_min: [ 42, 1.0 ] # x >= 1.0
nth_num_greater: [ 42, 2.0 ] # x > 2.0
nth_num_not: [ 42, 5.0 ] # x != 5.0
nth_num: [ 42, 7.0 ] # x == 7.0
nth_num_less: [ 42, 8.0 ] # x < 8.0
nth_num_max: [ 42, 9.0 ] # x <= 9.0
nth: [ 2, Expected ] # Nth value in the column. Will be compared as strings.
nth_not: [ 2, Not expected ] # Not allowed as the N-th value in the column. Will be compared as strings.
# Last number in the column. Expected value is float or integer.
last_num_min: 1.0 # x >= 1.0
last_num_greater: 2.0 # x > 2.0
last_num_not: 5.0 # x != 5.0
last_num: 7.0 # x == 7.0
last_num_less: 8.0 # x < 8.0
last_num_max: 9.0 # x <= 9.0
last: Expected # Last value in the column. Will be compared as strings.
last_not: Not expected # Not allowed as the last value in the column. Will be compared as strings.
# Sum of the numbers in the column. Example: [1, 2, 3] => 6.
sum_min: 1.0 # x >= 1.0
sum_greater: 2.0 # x > 2.0
sum_not: 5.0 # x != 5.0
sum: 7.0 # x == 7.0
sum_less: 8.0 # x < 8.0
sum_max: 9.0 # x <= 9.0
# Regular the arithmetic mean. The sum of the numbers divided by the count.
average_min: 1.0 # x >= 1.0
average_greater: 2.0 # x > 2.0
average_not: 5.0 # x != 5.0
average: 7.0 # x == 7.0
average_less: 8.0 # x < 8.0
average_max: 9.0 # x <= 9.0
# Total number of rows in the CSV file.
# Since any(!) values are taken into account, it only makes sense to use these rules once in any column.
count_min: 1 # x >= 1
count_greater: 2 # x > 2
count_not: 0 # x != 0
count: 7 # x == 7
count_less: 8 # x < 8
count_max: 9 # x <= 9
# Counts only empty values (string length is 0).
count_empty_min: 1 # x >= 1
count_empty_greater: 2 # x > 2
count_empty_not: 0 # x != 0
count_empty: 7 # x == 7
count_empty_less: 8 # x < 8
count_empty_max: 9 # x <= 9
# Counts only not empty values (string length is not 0).
count_not_empty_min: 1 # x >= 1
count_not_empty_greater: 2 # x > 2
count_not_empty_not: 0 # x != 0
count_not_empty: 7 # x == 7
count_not_empty_less: 8 # x < 8
count_not_empty_max: 9 # x <= 9
# Number of unique values.
count_distinct_min: 1 # x >= 1
count_distinct_greater: 2 # x > 2
count_distinct_not: 0 # x != 0
count_distinct: 7 # x == 7
count_distinct_less: 8 # x < 8
count_distinct_max: 9 # x <= 9
# Number of positive values.
count_positive_min: 1 # x >= 1
count_positive_greater: 2 # x > 2
count_positive_not: 0 # x != 0
count_positive: 7 # x == 7
count_positive_less: 8 # x < 8
count_positive_max: 9 # x <= 9
# Number of negative values.
count_negative_min: 1 # x >= 1
count_negative_greater: 2 # x > 2
count_negative_not: 0 # x != 0
count_negative: 7 # x == 7
count_negative_less: 8 # x < 8
count_negative_max: 9 # x <= 9
# Number of zero values. Any text and spaces (i.e. anything that doesn't look like a number) will be converted to 0.
count_zero_min: 1 # x >= 1
count_zero_greater: 2 # x > 2
count_zero_not: 0 # x != 0
count_zero: 7 # x == 7
count_zero_less: 8 # x < 8
count_zero_max: 9 # x <= 9
# Number of even values.
count_even_min: 1 # x >= 1
count_even_greater: 2 # x > 2
count_even_not: 0 # x != 0
count_even: 7 # x == 7
count_even_less: 8 # x < 8
count_even_max: 9 # x <= 9
# Number of odd values.
count_odd_min: 1 # x >= 1
count_odd_greater: 2 # x > 2
count_odd_not: 0 # x != 0
count_odd: 7 # x == 7
count_odd_less: 8 # x < 8
count_odd_max: 9 # x <= 9
# Number of prime values.
count_prime_min: 1 # x >= 1
count_prime_greater: 2 # x > 2
count_prime_not: 0 # x != 0
count_prime: 7 # x == 7
count_prime_less: 8 # x < 8
count_prime_max: 9 # x <= 9
# Calculate the median average of a list of numbers.
# See: https://en.wikipedia.org/wiki/Median
median_min: 1.0 # x >= 1.0
median_greater: 2.0 # x > 2.0
median_not: 5.0 # x != 5.0
median: 7.0 # x == 7.0
median_less: 8.0 # x < 8.0
median_max: 9.0 # x <= 9.0
# Harmonic mean (subcontrary mean). The harmonic mean can be expressed as the reciprocal of the arithmetic mean of the reciprocals.
# Appropriate for situations when the average of rates is desired.
# See: https://en.wikipedia.org/wiki/Harmonic_mean
harmonic_mean_min: 1.0 # x >= 1.0
harmonic_mean_greater: 2.0 # x > 2.0
harmonic_mean_not: 5.0 # x != 5.0
harmonic_mean: 7.0 # x == 7.0
harmonic_mean_less: 8.0 # x < 8.0
harmonic_mean_max: 9.0 # x <= 9.0
# Geometric mean. A type of mean which indicates the central tendency or typical value of a set of numbers
# by using the product of their values (as opposed to the arithmetic mean which uses their sum).
# See: https://en.wikipedia.org/wiki/Geometric_mean
geometric_mean_min: 1.0 # x >= 1.0
geometric_mean_greater: 2.0 # x > 2.0
geometric_mean_not: 5.0 # x != 5.0
geometric_mean: 7.0 # x == 7.0
geometric_mean_less: 8.0 # x < 8.0
geometric_mean_max: 9.0 # x <= 9.0
# Contraharmonic mean. A function complementary to the harmonic mean. A special case of the Lehmer mean, L₂(x), where p = 2.
# See: https://en.wikipedia.org/wiki/Contraharmonic_mean
contraharmonic_mean_min: 1.0 # x >= 1.0
contraharmonic_mean_greater: 2.0 # x > 2.0
contraharmonic_mean_not: 5.0 # x != 5.0
contraharmonic_mean: 7.0 # x == 7.0
contraharmonic_mean_less: 8.0 # x < 8.0
contraharmonic_mean_max: 9.0 # x <= 9.0
# Root mean square (quadratic mean) The square root of the arithmetic mean of the squares of a set of numbers.
# See: https://en.wikipedia.org/wiki/Root_mean_square
root_mean_square_min: 1.0 # x >= 1.0
root_mean_square_greater: 2.0 # x > 2.0
root_mean_square_not: 5.0 # x != 5.0
root_mean_square: 7.0 # x == 7.0
root_mean_square_less: 8.0 # x < 8.0
root_mean_square_max: 9.0 # x <= 9.0
# Trimean (TM, or Tukey's trimean).
# A measure of a probability distribution's location defined as a weighted average of the distribution's median and its two quartiles.
# See: https://en.wikipedia.org/wiki/Trimean
trimean_min: 1.0 # x >= 1.0
trimean_greater: 2.0 # x > 2.0
trimean_not: 5.0 # x != 5.0
trimean: 7.0 # x == 7.0
trimean_less: 8.0 # x < 8.0
trimean_max: 9.0 # x <= 9.0
# Cubic mean. See: https://en.wikipedia.org/wiki/Cubic_mean
cubic_mean_min: 1.0 # x >= 1.0
cubic_mean_greater: 2.0 # x > 2.0
cubic_mean_not: 5.0 # x != 5.0
cubic_mean: 7.0 # x == 7.0
cubic_mean_less: 8.0 # x < 8.0
cubic_mean_max: 9.0 # x <= 9.0
# Compute the P-th percentile of a list of numbers.
# Linear interpolation between closest ranks method - Second variant, C = 1 P-th percentile (0 <= P <= 100) of a list of N ordered values (sorted from least to greatest).
# Similar method used in NumPy and Excel.
# See: https://en.wikipedia.org/wiki/Percentile#Second_variant.2C_.7F.27.22.60UNIQ--postMath-00000043-QINU.60.22.27.7F
# Example: `[ 95.5, 1.234 ]` The 95.5th percentile in the column must be "1.234" (float).
percentile_min: [ 95.0, 1.0 ] # x >= 1.0
percentile_greater: [ 95.0, 2.0 ] # x > 2.0
percentile_not: [ 95.0, 5.0 ] # x != 5.0
percentile: [ 95.0, 7.0 ] # x == 7.0
percentile_less: [ 95.0, 8.0 ] # x < 8.0
percentile_max: [ 95.0, 9.0 ] # x <= 9.0
# Quartiles. Three points that divide the data set into four equal groups, each group comprising a quarter of the data.
# See: https://en.wikipedia.org/wiki/Quartile
# There are multiple methods for computing quartiles: ["exclusive", "inclusive"]. Exclusive is ussually classic.
# Available types: ["0%", "Q1", "Q2", "Q3", "100%", "IQR"] ("IQR" is Interquartile Range)
# Example: `[ inclusive, 'Q3', 42.0 ]` - the Q3 inclusive quartile is 42.0
quartiles_min: [ exclusive, '0%', 1.0 ] # x >= 1.0
quartiles_greater: [ inclusive, 'Q1', 2.0 ] # x > 2.0
quartiles_not: [ exclusive, 'Q2', 5.0 ] # x != 5.0
quartiles: [ inclusive, 'Q3', 7.0 ] # x == 7.0
quartiles_less: [ exclusive, '100%', 8.0 ] # x < 8.0
quartiles_max: [ inclusive, 'IQR', 9.0 ] # x <= 9.0
# Midhinge. The average of the first and third quartiles and is thus a measure of location.
# Equivalently, it is the 25% trimmed mid-range or 25% midsummary; it is an L-estimator.
# See: https://en.wikipedia.org/wiki/Midhinge
# Midhinge = (first quartile, third quartile) / 2
midhinge_min: 1.0 # x >= 1.0
midhinge_greater: 2.0 # x > 2.0
midhinge_not: 5.0 # x != 5.0
midhinge: 7.0 # x == 7.0
midhinge_less: 8.0 # x < 8.0
midhinge_max: 9.0 # x <= 9.0
# MAD - mean absolute deviation. The average of the absolute deviations from a central point.
# It is a summary statistic of statistical dispersion or variability.
# See: https://en.wikipedia.org/wiki/Average_absolute_deviation
mean_abs_dev_min: 1.0 # x >= 1.0
mean_abs_dev_greater: 2.0 # x > 2.0
mean_abs_dev_not: 5.0 # x != 5.0
mean_abs_dev: 7.0 # x == 7.0
mean_abs_dev_less: 8.0 # x < 8.0
mean_abs_dev_max: 9.0 # x <= 9.0
# MAD - median absolute deviation. The average of the absolute deviations from a central point.
# It is a summary statistic of statistical dispersion or variability.
# It is a robust measure of the variability of a univariate sample of quantitative data.
# See: https://en.wikipedia.org/wiki/Median_absolute_deviation
median_abs_dev_min: 1.0 # x >= 1.0
median_abs_dev_greater: 2.0 # x > 2.0
median_abs_dev_not: 5.0 # x != 5.0
median_abs_dev: 7.0 # x == 7.0
median_abs_dev_less: 8.0 # x < 8.0
median_abs_dev_max: 9.0 # x <= 9.0
# Population variance - Use when all possible observations of the system are present.
# If used with a subset of data (sample variance), it will be a biased variance.
# n degrees of freedom, where n is the number of observations.
population_variance_min: 1.0 # x >= 1.0
population_variance_greater: 2.0 # x > 2.0
population_variance_not: 5.0 # x != 5.0
population_variance: 7.0 # x == 7.0
population_variance_less: 8.0 # x < 8.0
population_variance_max: 9.0 # x <= 9.0
# Unbiased sample variance Use when only a subset of all possible observations of the system are present.
# n - 1 degrees of freedom, where n is the number of observations.
sample_variance_min: 1.0 # x >= 1.0
sample_variance_greater: 2.0 # x > 2.0
sample_variance_not: 5.0 # x != 5.0
sample_variance: 7.0 # x == 7.0
sample_variance_less: 8.0 # x < 8.0
sample_variance_max: 9.0 # x <= 9.0
# Standard deviation (For a sample; uses sample variance). It also known as SD or StdDev.
# StdDev is a measure that is used to quantify the amount of variation or dispersion of a set of data values.
# - Low standard deviation indicates that the data points tend to be close to the mean (also called the expected value) of the set.
# - High standard deviation indicates that the data points are spread out over a wider range of values.
# See: https://en.wikipedia.org/wiki/Standard_deviation
stddev_min: 1.0 # x >= 1.0
stddev_greater: 2.0 # x > 2.0
stddev_not: 5.0 # x != 5.0
stddev: 7.0 # x == 7.0
stddev_less: 8.0 # x < 8.0
stddev_max: 9.0 # x <= 9.0
# SD+ (Standard deviation for a population; uses population variance)
stddev_pop_min: 1.0 # x >= 1.0
stddev_pop_greater: 2.0 # x > 2.0
stddev_pop_not: 5.0 # x != 5.0
stddev_pop: 7.0 # x == 7.0
stddev_pop_less: 8.0 # x < 8.0
stddev_pop_max: 9.0 # x <= 9.0
# Coefficient of variation (cᵥ) Also known as relative standard deviation (RSD)
# A standardized measure of dispersion of a probability distribution or frequency distribution.
# It is often expressed as a percentage. The ratio of the standard deviation to the mean.
# See: https://en.wikipedia.org/wiki/Coefficient_of_variation
coef_of_var_min: 1.0 # x >= 1.0
coef_of_var_greater: 2.0 # x > 2.0
coef_of_var_not: 5.0 # x != 5.0
coef_of_var: 7.0 # x == 7.0
coef_of_var_less: 8.0 # x < 8.0
coef_of_var_max: 9.0 # x <= 9.0
# Interquartile mean (IQM). A measure of central tendency based on the truncated mean of the interquartile range.
# Only the data in the second and third quartiles is used (as in the interquartile range), and the lowest 25% and the highest 25% of the scores are discarded.
# See: https://en.wikipedia.org/wiki/Interquartile_mean
# Note: It's SUPER slow!!!
interquartile_mean_min: 1.0 # x >= 1.0
interquartile_mean_greater: 2.0 # x > 2.0
interquartile_mean_not: 5.0 # x != 5.0
interquartile_mean: 7.0 # x == 7.0
interquartile_mean_less: 8.0 # x < 8.0
interquartile_mean_max: 9.0 # x <= 9.0
- name: another_column
rules:
not_empty: true
- name: inherited_column_login
preset: my-preset/login
- name: inherited_column_full_name
preset: my-preset/full_name