\<^marker>\<open>creator Maximilian P. L. Haslbeck\<close>
theory FiniteHypClasses
imports "HOL-Probability.Probability" Pi_pmf LearningTheory
begin
paragraph \<open>Summary\<close>
text \<open>This theory proves that finite hypothesis classes are PAC learnable. It formalizes the
proof of Corollary 2.3/Corollary 3.2 of @{cite UnderstandingML}.\<close>
paragraph \<open>Main Theorems\<close>
text \<open>
\<^item> \<open>finiteHypothesisClass.corollary_2_3\<close>: every finite hypothesis class is PAC learnable.
\<close>
section "auxiliary lemmas"
lemma fixes m :: nat and h \<delta> \<epsilon> :: real
assumes
nn: "h > 0" and
epos: "\<epsilon> > 0" and m: "real m \<ge> ln (h / \<delta>) / \<epsilon>"
and dd: "\<delta> > 0" "\<delta> < 1"
shows aux_estim: "h * exp (-\<epsilon> * m) \<le> \<delta>"
proof -
from m epos have "ln ( h / \<delta>) \<le> real m * \<epsilon>"
by (smt divide_cancel_right mult_imp_le_div_pos nonzero_mult_div_cancel_right)
then have "( h / \<delta>) \<le> exp (real m * \<epsilon>)" using dd nn
by (metis (full_types) divide_pos_pos exp_le_cancel_iff exp_ln of_nat_0_less_iff)
then have "h \<le> \<delta> * exp (real m * \<epsilon>)" using dd
using divide_divide_eq_left exp_gt_zero less_divide_eq_1_pos
by (simp add: mult.commute pos_divide_le_eq)
then have A: "h / exp (real m * \<epsilon>) \<le> \<delta>"
by (smt dd(1) divide_divide_eq_left exp_gt_zero less_divide_eq_1_pos mult.commute mult_pos_pos)
have B: "h / exp (real m * \<epsilon>) = h * exp (-\<epsilon> * m)"
proof -
have "h / exp (real m * \<epsilon>) = h * (exp 0) / exp (real m * \<epsilon>)"
by auto
also have "\<dots> = h * exp (0 - real m * \<epsilon>)" apply(subst exp_diff) by auto
also have "\<dots> = h * exp (- \<epsilon> * real m)" by auto
finally show ?thesis .
qed
from A B show ed: "h * exp (-\<epsilon> * m) \<le> \<delta>" by auto
qed
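text \<open>Informally, this estimate is a mere rearrangement of the assumption on \<open>m\<close>: from
\<open>m \<ge> ln (h / \<delta>) / \<epsilon>\<close> and \<open>\<epsilon> > 0\<close> we obtain \<open>\<epsilon> * m \<ge> ln (h / \<delta>)\<close>; applying the
monotone function \<open>exp\<close> gives \<open>exp (\<epsilon> * m) \<ge> h / \<delta>\<close>, and multiplying both sides by
\<open>\<delta> * exp (- \<epsilon> * m) > 0\<close> yields \<open>h * exp (- \<epsilon> * m) \<le> \<delta>\<close>. This recap just mirrors
the formal steps above.\<close>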
section "finite Hypothesis classes are PAC learnable"
(* now assume we have a finite class of Hypotheses *)
locale finiteHypothesisClass = learning_basics where X=X and Y=Y and H = H
for X::"'a set" and Y::"'b set" and H :: "('a \<Rightarrow> 'b) set" +
assumes fH: "finite H"
begin
text \<open>Let us now analyze the performance of the "Empirical Risk Minimization" (ERM) rule
w.r.t. the finite hypothesis class H.
The learning rule ERMe chooses, for a given training set S, a hypothesis h from H
that minimizes the training error.\<close>
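text \<open>For instance (informally): if \<open>H = {h\<^sub>1, h\<^sub>2}\<close> and, on a given training set S of size m,
\<open>h\<^sub>1\<close> misclassifies two examples while \<open>h\<^sub>2\<close> misclassifies none, then \<open>h\<^sub>2\<close> is the unique
minimizer of the training error, so \<open>ERM S m = {h\<^sub>2}\<close> and the learner outputs \<open>ERMe S m = h\<^sub>2\<close>.\<close>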
lemma ERM_nonempty: "H\<noteq>{} \<Longrightarrow> ERM S n \<noteq> {}" unfolding ERM_def
by (simp add: ex_is_arg_min_if_finite fH)
text \<open>Now we show that the ERM rule is PAC learnable,
assuming a finite hypothesis class H.\<close>
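text \<open>The proof follows the textbook argument: using the realizability assumption, a sample on
which the ERM learner fails must be "misleading", i.e. some "bad" hypothesis (one with prediction
error above \<epsilon>) attains training error zero on it. A union bound over the finitely many bad
hypotheses bounds the failure probability by \<open>card H * exp (- \<epsilon> * m)\<close>, which is at most \<delta>
once \<open>m \<ge> ln (card H / \<delta>) / \<epsilon>\<close>, by the auxiliary lemma aux_estim above.\<close>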
lemma fixes
D :: "('a\<times>'b) pmf"
and \<epsilon> \<delta> :: real
and m :: nat
assumes
epos: "\<epsilon> > 0" and m: "real m \<ge> ln (real (card H) / \<delta>) / \<epsilon>"
and dd: "\<delta> > 0" "\<delta> < 1"
and DX: "set_pmf D \<subseteq> (X\<times>Y)"
and RealizabilityAssumption: "\<exists>h'\<in>H. PredErr D h' = 0"
shows corollary_2_3_aux: "measure_pmf.prob (Samples m D) {S. PredErr D (ERMe S m) \<le> \<epsilon>}
\<ge> 1 - \<delta>" (is "?LHS \<ge> 1 - \<delta>")
proof -
text \<open>Fixing some distribution @{term D}, we are
interested in upper-bounding the probability of sampling an m-tuple of instances that
leads to failure of the learner.
Formally, we would like to upper-bound @{term "measure_pmf.prob (Samples m D) {S. PredErr D (ERMe S m) > \<epsilon>}"}.\<close>
text \<open>Let ?S be the carrier set (support) of the distribution of m-tuples of labelled sample pairs drawn from D.\<close>
let ?S = "set_pmf (Samples m D)"
text \<open>Let ?Hb be the set of "bad" hypotheses, i.e. those with prediction error greater than \<epsilon>:\<close>
let ?Hb = "{h\<in>H. PredErr D h > \<epsilon>}"
have fHb: "finite ?Hb" using fH by simp
from fH have cHb: "card ?Hb \<le> card H"
by (simp add: card_mono)
text \<open>In addition, let\<close>
let ?M = "{S\<in>?S. \<exists>h\<in>?Hb. TrainErr S {0..<m} h = 0}"
text \<open>be the set of misleading samples: namely, for every S in ?M there is a "bad" hypothesis
h in ?Hb that looks like a "good" hypothesis on S.\<close>
text \<open>We can upper-bound the "bad" events (in which we are interested) by ?M:\<close>
have A: "{S\<in>?S. PredErr D (ERMe S m) > \<epsilon>} \<subseteq> ?M"
proof
from RealizabilityAssumption
obtain h' where h'H: "h'\<in>H" and u: "PredErr D h' = 0" by blast
from u have "measure_pmf.prob D {S \<in> set_pmf D. snd S \<noteq> h' (fst S)} = 0" unfolding PredErr_alt .
with measure_pmf_zero_iff[of D "{S \<in> set_pmf D. snd S \<noteq> h' (fst S)}"]
have correct: "\<And>x. x\<in>set_pmf D \<Longrightarrow> snd x = h' (fst x)" by blast
fix S
assume "S \<in> {S \<in>?S. \<epsilon> < PredErr D (ERMe S m)}"
then have SS: "S\<in>?S" and 2: "\<epsilon> < PredErr D (ERMe S m)" by auto
from SS set_Pi_pmf[where A="{0..<m}"]
have tD: "\<And>i. i\<in>{0..<m} \<Longrightarrow> S i \<in> set_pmf D"
unfolding Samples_def by auto
have z: "\<And>i. i\<in>{0..<m} \<Longrightarrow> (case S i of (x, y) \<Rightarrow> if h' x \<noteq> y then 1::real else 0) = 0"
using tD correct
by (simp add: case_prod_beta')
have Th'0: "TrainErr S {0..<m} h' = 0"
unfolding TrainErr_def using z
by fastforce
with h'H ERM_0_in have h'ERM: "h' \<in> ERM S m" by blast
then have "TrainErr S {0..<m} (ERMe S m) = 0" using Th'0 by(rule ERMe_minimal)
have "ERMe S m \<in> H" unfolding ERMe_def using ERM_subset ERM_nonempty
using nnH some_in_eq by blast
have "\<exists>h\<in>{h \<in> H. \<epsilon> < PredErr D h}. TrainErr S {0..<m} h = 0"
apply(rule bexI[where x="ERMe S m"])
apply auto by fact+
with SS show "S \<in> ?M" by auto
qed
text \<open>Note that we can rewrite ?M as\<close>
have prop_2_5: "?M = (\<Union>h\<in>?Hb. {S\<in>?S. TrainErr S {0..<m} h = 0})"
by auto
text \<open>Next, let us bound the probability of each of the inner sets.\<close>
have prop_2_9: "\<And>h. h\<in>?Hb \<Longrightarrow> measure_pmf.prob (Samples m D) ({S\<in>?S. TrainErr S {0..<m} h = 0}) \<le> exp (-\<epsilon> * m)"
proof -
text \<open>Fix some "bad" hypothesis h \<in> ?Hb.\<close>
fix h
assume "h\<in>?Hb"
then have z: "PredErr D h > \<epsilon>" by auto
(* note: `m > 0` follows from `real m \<ge> ln (real (card H) / \<delta>) / \<epsilon>`:
as card H \<ge> 1 and 0 < \<delta> < 1, the right-hand side is strictly positive *)
from nnH fH have "card H > 0" by auto
with m have mnn: "m>0" using epos dd
using fH leD real_of_nat_ge_one_iff by fastforce
from mnn have "finite {0..<m}" "{0..<m} \<noteq> {}" by auto
from TrainErr_correct[OF this] have Tc: "\<And>S h i. TrainErr S {0..<m} h = 0 \<Longrightarrow> i \<in> {0..<m} \<Longrightarrow> h (fst (S i)) = snd (S i)" .
text \<open>We first estimate the probability that the "bad" hypothesis makes a "good" prediction.\<close>
have individual_estim: "measure_pmf.prob D {x. snd x = h (fst x)} \<le> exp (-\<epsilon>)"
proof -
have "measure_pmf.prob D {x. snd x = h (fst x)} = 1 - PredErr D h" (is "?L = ?R")
proof -
have "?L = measure_pmf.prob D {S \<in> space (measure_pmf D). \<not> (S\<in>{x. snd x \<noteq> h (fst x)})}"
by auto
also have "\<dots> = 1 - measure_pmf.prob D {x \<in> space (measure_pmf D). x \<in> {x. snd x \<noteq> h (fst x)}}"
apply(rule measure_pmf.prob_neg) by simp
also have "\<dots> = ?R" unfolding PredErr_def by auto
finally show ?thesis .
qed
also have "\<dots> \<le> 1 - \<epsilon>" using z by force
(* using the inequality `1-\<epsilon> \<le> exp (-\<epsilon>)` *)
also have "\<dots> \<le> exp (-\<epsilon>)" using epos
by (metis add_uminus_conv_diff exp_ge_add_one_self)
finally show"measure_pmf.prob D {x. snd x = h (fst x)} \<le> exp (-\<epsilon>)" .
qed
\<comment> \<open>The event @{text "L\<^sub>S(h)=0"} is equivalent to the event @{text "\<forall>i. h(x\<^sub>i) = f(x\<^sub>i)"}\<close>
have "measure_pmf.prob (Samples m D) {S\<in>?S. TrainErr S {0..<m} h = 0}
\<le> measure_pmf.prob (Samples m D) {S\<in>?S. (\<forall>i\<in>{0..<m}. snd (S i) = h (fst (S i)))}"
by (auto simp add: Tc intro: measure_pmf.finite_measure_mono)
\<comment> \<open>Since the examples in the training set are sampled i.i.d. we get that:\<close>
also have "\<dots> \<le> measure_pmf.prob (Samples m D) (repeated_event m {(x,y). y = h x})" (* equality should also be correct, but \<le> takes less effort *)
proof (rule measure_pmf.finite_measure_mono, safe)
fix S
assume S: "S \<in> set_pmf (Samples m D)"
then have 1: "\<And>x. x \<notin> {0..<m} \<Longrightarrow> S x = undefined"
using set_Pi_pmf_subset[of "{0..<m}" undefined "(\<lambda>_. D)"] unfolding Samples_def
by blast
assume A: "\<forall>i\<in>{0..<m}. (snd (S i)) = h (fst (S i))"
{ fix i
assume i: "i \<in> {0..<m}"
with set_Pi_pmf[OF _ S[unfolded Samples_def]] have Si: "S i \<in> set_pmf D" by auto
fix x y assume "S i = (x,y)"
with Si
have "y = h x" using A i by force
} note 2=this
show "S \<in> repeated_event m {(x, y). y = h x}" unfolding repeated_event_def PiE_dflt_def apply safe by (fact 1 2)+
qed simp
also have "\<dots> = (measure_pmf.prob D {(x,y). y = h x}) ^ m"
by(rule iid)
\<comment> \<open>by estimating each individual draw we obtain\<close>
also have "\<dots> \<le> (exp (-\<epsilon>)) ^ m"
apply(rule power_mono)
apply (metis (mono_tags, lifting) Collect_cong case_prod_beta individual_estim)
by auto
also have "\<dots> \<le> exp (-\<epsilon> * m)"
by (metis exp_of_nat2_mult order_refl)
finally show "measure_pmf.prob (Samples m D) ({S\<in>?S. TrainErr S {0..<m} h = 0}) \<le> exp (-\<epsilon> * m)" .
qed
text "now we can plug all the estimations together:"
have "measure_pmf.prob (Samples m D) {S. PredErr D (ERMe S m) > \<epsilon>}
= measure_pmf.prob (Samples m D) {S\<in>?S. PredErr D (ERMe S m) > \<epsilon>}"
by (auto intro: pmf_prob_cong simp add: set_pmf_iff)
\<comment> \<open>the overapproximation by ?M:\<close>
also have "\<dots> \<le> measure_pmf.prob (Samples m D) ?M"
apply(rule measure_pmf.finite_measure_mono) apply (fact A) by auto
\<comment> \<open>the rewrite of ?M as a big Union:\<close>
also have "\<dots> = measure_pmf.prob (Samples m D) (\<Union>h\<in>?Hb. {S\<in>?S. TrainErr S {0..<m} h = 0})"
using prop_2_5 by auto
\<comment> \<open>bounding the probability of a Union of events by the sum of the probability of the events:\<close>
also have "\<dots> \<le> sum (\<lambda>h. measure_pmf.prob (Samples m D) ({S\<in>?S. TrainErr S {0..<m} h = 0})) ?Hb"
apply(rule measure_pmf.finite_measure_subadditive_finite) using fHb by auto
\<comment> \<open>applying the estimation:\<close>
also have "\<dots> \<le> sum (\<lambda>h. exp (-\<epsilon> * m)) ?Hb"
apply(rule sum_mono) using prop_2_9 by blast
\<comment> \<open>some final rewrites:\<close>
also have "\<dots> \<le> (card ?Hb) * exp (-\<epsilon> * m)" using fHb by auto
also have "\<dots> \<le> (card H) * exp (-\<epsilon> * m)" using cHb by simp
finally have k: "measure_pmf.prob (Samples m D) {S. PredErr D (ERMe S m) > \<epsilon>} \<le> (card H) * exp (-\<epsilon> * m)" .
text \<open>Using the assumption on m, the auxiliary estimate bounds \<open>card H * exp (- \<epsilon> * m)\<close> by \<delta>:\<close>
from nnH fH have nn: "0 < real (card H)" by fastforce
note bound = \<open>m \<ge> ln (card H / \<delta>) / \<epsilon>\<close>
from aux_estim[OF nn epos bound dd]
have ed: "(card H) * exp (-\<epsilon> * m) \<le> \<delta>" by auto
text \<open>Put everything together to yield the final result:\<close>
have "1 - \<delta> \<le> 1 - (card H) * exp (-\<epsilon> * m)" using ed by simp
also have "\<dots> \<le> 1 - measure_pmf.prob (Samples m D) {S. PredErr D (ERMe S m) > \<epsilon>}"
using k by linarith
also have "\<dots> = ?LHS" (is "?R = _")
proof -
have "?LHS = measure_pmf.prob (Samples m D) {S \<in> space (measure_pmf (Samples m D)). \<not> (S\<in>{S. \<epsilon> < PredErr D (ERMe S m)})}"
apply auto by (meson not_le)
also have "\<dots> = 1 - measure_pmf.prob (Samples m D) {x \<in> space (measure_pmf (Samples m D)). x \<in> {S. \<epsilon> < PredErr D (ERMe S m)}}"
apply(rule measure_pmf.prob_neg) by simp
also have "\<dots> = ?R" by auto
finally show ?thesis by simp
qed
finally show "measure_pmf.prob (Samples m D) {S. PredErr D (ERMe S m) \<le> \<epsilon>} \<ge> 1 - \<delta>" by simp
qed
definition "ERMbound \<epsilon> \<delta> = nat \<lceil>ln (real (card H) / \<delta>) / \<epsilon>\<rceil>"
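text \<open>As a concrete sanity check (an informal aside, not used in the proofs): for
\<open>card H = 1000\<close>, accuracy \<open>\<epsilon> = 0.1\<close> and confidence \<open>\<delta> = 0.01\<close>, the bound evaluates to
\<open>nat \<lceil>ln (1000 / 0.01) / 0.1\<rceil> = nat \<lceil>115.129...\<rceil> = 116\<close> samples; the required sample
size grows only logarithmically in the size of the hypothesis class.\<close>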
text \<open>Corollaries 2.3 and 3.2\<close>
lemma corollary_2_3: "PAC_learnable ERMe"
unfolding PAC_learnable_def
apply(rule exI[where x="ERMbound"])
apply safe unfolding ERMbound_def
using corollary_2_3_aux by fastforce
end
end