-
Notifications
You must be signed in to change notification settings - Fork 96
/
metaschema.1.schema.json
198 lines (198 loc) · 9.31 KB
/
metaschema.1.schema.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
{
"$schema": "http://json-schema.org/draft-04/schema#",
"id": "http://jsonschema.net",
"type": "object",
"description": "Metaschema for validating the structure of the per-doctype schemas in this repository",
"properties": {
"mozPipelineMetadata": {
"type": "object",
"description": "Container for per-doctype metadata that can affect how pings are processed in the pipeline",
"additionalProperties": false,
"properties": {
"$comment": {
"type": "string",
"description": "Optional comment about the pipeline handling for this doctype"
},
"bq_table": {
"type": "string",
"description": "NOT YET IMPLEMENTED: The name of the destination BigQuery table"
},
"bq_dataset_family": {
"type": "string",
"description": "NOT YET IMPLEMENTED: The base name for the destination BigQuery dataset; if this is set to 'telemetry', the pipeline will write into 'telemetry_live'"
},
"bq_metadata_format": {
"type": "string",
"description": "The logical format for the metadata struct in the destination BigQuery table",
"enum": ["structured", "telemetry"]
},
"submission_timestamp_granularity": {
"type": "string",
"description": "If specified, the submission_timestamp field will be truncated to the specified granularity in the pipeline before being output to BigQuery; this can be used to reduce the potential for using time-based attacks to correlate datasets using different client-level identifiers; see Java's ChronoUnit for additional granularities that could be considered for inclusion; implemented for bug 1742172",
"enum": ["millis", "seconds", "minutes", "hours", "days"]
},
"expiration_policy": {
"type": "object",
"description": "Various options controlling data lifecycle",
"additionalProperties": false,
"properties": {
"delete_after_days": {
"type": "integer",
"description": "If present, a time_partitioning_expiration policy will be set on the destination stable table in BigQuery"
},
"collect_through_date": {
"type": "string",
"pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}$",
"description": "If present, the pipeline will reject new data with submission_timestamp after the given date, sending it to error output"
}
}
},
"include_client_id": {
"type": "boolean",
"description": "Glean ping property that determines whether a client id is sent in the ping."
},
"include_info_sections": {
"type": "boolean",
"description": "Glean ping property that determines whether info sections are sent, e.g. client_info, ping_info."
},
"override_attributes": {
"type": "array",
"description": "Mappings of Pub/Sub attribute names to static values; these are applied in the Decoder immediately before incorporating metadata into the payload, so can be used to overwrite values calculated in the pipeline; a null value will cause the pipeline to drop the named attribute; some attribute names differ from the nested metadata format in BigQuery, so for example you must use \"geo_city\" here in order to manipulate the value that shows up as metadata.geo.city; implemented for bug 1742172",
"items": {
"type": "object",
"additionalProperties": false,
"properties": {
"name":{
"type": "string",
"enum": [
"geo_city",
"geo_subdivision1",
"geo_subdivision2",
"normalized_channel"
]
},
"value":{
"type": ["string", "null"]
}
},
"required": [
"name",
"value"
]
}
},
"geoip_skip_entries": {
"description": "If present, how many additional entries (beyond two) to skip in x_forwarded_for when performing geoip decoding, useful when submissions are ingested from trusted proxies; if there are fewer entries in x_forwarded_for than (N+1) the last entry is used instead of (N+3)rd-to-last",
"type": "integer"
},
"jwe_mappings": {
"type": "array",
"description": "Mappings of encrypted JWE field paths to destinations where the value decrypted by the pipeline should be placed; initial use case is Account Ecosystem Telemetry; paths must be in [JSON Pointer format](https://tools.ietf.org/html/rfc6901) like '/payload/ecosystemAnonId'",
"items": {
"type": "object",
"additionalProperties": false,
"properties": {
"source_field_path":{
"type": "string",
"pattern": "^/.*$"
},
"decrypted_field_path":{
"type": "string",
"pattern": "^/.*$"
}
},
"required": [
"source_field_path",
"decrypted_field_path"
]
}
},
"sample_id_source_uuid_attribute": {
"description": "If specified, sample_id will be calculated from a hash of the specified attribute if it is a valid UUID after removing curly brackets; if neither this nor sample_id_source_uuid_payload_path are specified, the client_id attribute will be used; implemented for DENG-547",
"type": "string"
},
"sample_id_source_uuid_payload_path": {
"description": "If specified and sample_id was not set due to sample_id_source_uuid_attribute, then sample_id will be calculated from a hash of the specified payload path; this is specified as a list strings to allow indicating nested fields, so for example metrics.uuid.legacy_ids_client_id would be specified as [\"metrics\",\"uuid\",\"legacy_ids_client_id\"]; if neither this nor sample_id_source_uuid_attribute are specified, the client_id attribute will be used; implemented for DENG-547",
"items": {
"type": "string"
},
"type": "array"
},
"split_config": {
"type": "object",
"description": "Configuration for splitting a ping into multiple pings by field",
"additionalProperties": false,
"properties": {
"preserve_original": {
"type": "boolean",
"description": "Whether or not to output the unmodified original ping in addition to any generated pings."
},
"remainder": {
"type": "object",
"description": "If present, generate a ping containing all fields not included in any subset ping.",
"additionalProperties": false,
"properties": {
"document_namespace": {
"type": "string"
},
"document_type": {
"type": "string"
},
"document_version": {
"type": "string"
}
},
"required": [
"document_namespace",
"document_type",
"document_version"
]
},
"subsets": {
"type": "array",
"description": "Array of subset pings to generate.",
"items": {
"type": "object",
"description": "Configuration for generating a ping that is a subset of fields from the original ping.",
"additionalProperties": false,
"properties": {
"document_namespace": {
"type": "string"
},
"document_type": {
"type": "string"
},
"document_version": {
"type": "string"
},
"pattern": {
"type": "string",
"description": "Regular expression matching .-delimited property names that should be moved to this subset ping. Only properties explictly defined in the non-generic json schema of the original ping are supported, because property names are matched during schema generation."
},
"extra_pattern": {
"type": "string",
"description": "Like pattern, except the schema of matched properties must also be present in the remainder, because schemas cannot delete fields. Data for matched properties will only go to this subset ping."
}
},
"required": [
"document_namespace",
"document_type",
"document_version",
"pattern"
]
}
}
},
"required": [
"preserve_original",
"subsets"
]
},
"json_object_path_regex": {
"description": "The path for which a JSON column will be enforced. This should be a regular expression which is used by the jsonschema-transpiler to match against the fully qualified name of a metric",
"type": "string"
}
}
}
}
}