Commit 4e392bf: history tweaks

Marek Petrik committed Jan 4, 2025
1 parent 084c75c

Showing 4 changed files with 183 additions and 80 deletions.
LeanMDPs/Expected.lean (2 changes: 0 additions & 2 deletions)
@@ -43,5 +43,3 @@ theorem dp_correct_vf (π : PolicyHR M) (T : ℕ) (h : Hist M) :
| Nat.succ t =>
let hp h' := dp_correct_vf π t h'
sorry
--calc
-- value_π π T h =
LeanMDPs/Finprob.lean (12 changes: 6 additions & 6 deletions)
@@ -224,15 +224,15 @@ variable {T₁ : Finset τ₁} {T₂ : Finset τ₂}

/-- Product of a probability distribution with a dependent probability
distribution is a probability distribution. -/
lemma prob_prod_prob (f : τ₁ → ℝ≥0) (g : τ₁ → τ₂ → ℝ≥0)
theorem prob_prod_prob (f : τ₁ → ℝ≥0) (g : τ₁ → τ₂ → ℝ≥0)
(h1 : ∑ t₁ ∈ T₁, f t₁ = 1)
(h2 : ∀ t₁ ∈ T₁, ∑ t₂ ∈ T₂, g t₁ t₂ = 1) :
∑ ⟨t₁,t₂⟩ ∈ (T₁ ×ˢ T₂), (f t₁) * (g t₁ t₂) = 1 :=
calc
∑ ⟨t₁,t₂⟩ ∈ (T₁ ×ˢ T₂), (f t₁)*(g t₁ t₂)
= ∑ t₁ ∈ T₁, ∑ t₂ ∈ T₂, (f t₁)*(g t₁ t₂) := by simp [Finset.sum_product]
_ = ∑ t₁ ∈ T₁, (f t₁) * (∑ t₂ ∈ T₂, (g t₁ t₂)) := by simp [Finset.sum_congr, Finset.mul_sum]
_ = ∑ t₁ ∈ T₁, (f t₁) := by simp_all [Finset.sum_congr, congrArg]
calc
∑ ⟨t₁,t₂⟩ ∈ (T₁ ×ˢ T₂), f t₁ * g t₁ t₂
= ∑ t₁ ∈ T₁, ∑ t₂ ∈ T₂, f t₁ * g t₁ t₂ := by simp only [Finset.sum_product]
_ = ∑ t₁ ∈ T₁, f t₁ * (∑ t₂ ∈ T₂, (g t₁ t₂)) := by simp only [Finset.mul_sum]
_ = ∑ t₁ ∈ T₁, f t₁ := by simp_all only [mul_one]
_ = 1 := h1
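
Read informally (a paraphrase of the statement, not part of this commit), the calc chain shows that a mixture of probability distributions is again a probability distribution:

  ∑ (t₁,t₂) ∈ T₁ ×ˢ T₂, f t₁ * g t₁ t₂
    = ∑ t₁ ∈ T₁, f t₁ * (∑ t₂ ∈ T₂, g t₁ t₂)  -- sum over the product, factor f t₁ out of the inner sum
    = ∑ t₁ ∈ T₁, f t₁ * 1                     -- each inner sum equals 1 by h2
    = 1                                        -- by h1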

LeanMDPs/Histories.lean (128 changes: 86 additions & 42 deletions)
@@ -42,20 +42,27 @@ structure MDP (σ α : Type) : Type where
P : σ → α → Δ S -- TODO : change to S → A → Δ S
/-- reward function s, a, s' -/
r : σ → α → σ → ℝ -- TODO: change to S → A → S → ℝ

end Definitions

variable {M : MDP σ α}

section Histories

/-- Represents a history. The state has type σ and the action has type α. -/
inductive Hist {σ α : Type} (M : MDP σ α) : Type where
| init : σ → Hist M
| prev : Hist M → α → σ → Hist M

/-- Coerces a state to a history -/
instance : Coe σ (Hist M) where
coe s := Hist.init s

/-- The length of the history corresponds to the zero-based step of the decision -/
def Hist.length : Hist M → ℕ
| init _ => 0
| prev h _ _ => 1 + length h


/-- Nonempty histories -/
abbrev HistNE {σ α : Type} (m : MDP σ α) : Type := {h : Hist m // h.length ≥ 1}

@@ -70,27 +77,31 @@ def Hist.append (h : Hist M) (as : α × σ) : Hist M :=

def tuple2hist : Hist M × α × σ → HistNE M
| ⟨h, as⟩ => ⟨h.append as, Nat.le.intro rfl⟩

def hist2tuple : HistNE M → Hist M × α × σ
| ⟨Hist.prev h a s, _ ⟩ => ⟨h, a, s⟩

open Function

/-- Proves that history append has a left inverse. -/
lemma linv_hist2tuple_tuple2hist :
Function.LeftInverse (hist2tuple (M := M)) tuple2hist := fun _ => rfl

lemma inj_tuple2hist_l1 : Function.Injective (tuple2hist (M:=M)) :=
Function.LeftInverse.injective linv_hist2tuple_tuple2hist
LeftInverse (hist2tuple (M := M)) tuple2hist := fun _ => rfl
lemma inj_tuple2hist_l1 : Injective (tuple2hist (M:=M)) :=
LeftInverse.injective linv_hist2tuple_tuple2hist
lemma inj_tuple2hist : Injective ((Subtype.val) ∘ (tuple2hist (M:=M))) :=
Injective.comp (Subtype.val_injective) inj_tuple2hist_l1

def emb_tuple2hist_l1 : Hist M × α × σ ↪ HistNE M := ⟨tuple2hist, inj_tuple2hist_l1⟩
def emb_tuple2hist : Hist M × α × σ ↪ Hist M := ⟨λ x ↦ tuple2hist x, inj_tuple2hist⟩

--- state
def state2hist (s : σ) : Hist M := Hist.init s
def hist2state : Hist M → σ | Hist.init s => s | Hist.prev _ _ s => s

lemma linv_hist2state_state2hist : LeftInverse (hist2state (M:=M)) state2hist := fun _ => rfl
lemma inj_state2hist : Injective (state2hist (M:=M)) :=
LeftInverse.injective linv_hist2state_state2hist

def state2hist_emb : σ ↪ Hist M := ⟨state2hist, inj_state2hist⟩

lemma inj_tuple2hist :
Function.Injective ((Subtype.val) ∘ (tuple2hist (M:=M))) :=
Function.Injective.comp (Subtype.val_injective) inj_tuple2hist_l1

/-- New history from a tuple. -/
def emb_tuple2hist_l1 : Hist M × α × σ ↪ HistNE M :=
{ toFun := tuple2hist, inj' := inj_tuple2hist_l1 }

def emb_tuple2hist : Hist M × α × σ ↪ Hist M :=
{ toFun := fun x => tuple2hist x, inj' := inj_tuple2hist }

/-- Checks if pre is the prefix of h. -/
def isprefix : Hist M → Hist M → Prop
@@ -106,19 +117,41 @@ def isprefix : Hist M → Hist M → Prop
else
(a₁ = a₂) ∧ (s₁' = s₂') ∧ (isprefix h₁ h₂)

/-- All histories of additional length T that follow history h -/
def Histories (h : Hist M) : ℕ → Finset (Hist M)
| Nat.zero => {h}
| Nat.succ t => ((Histories h t) ×ˢ M.A ×ˢ M.S).map emb_tuple2hist

abbrev ℋ : Hist M → ℕ → Finset (Hist M) := Histories

/-- All histories of a given length -/
def HistoriesHorizon : ℕ → Finset (Hist M)
| Nat.zero => M.S.map state2hist_emb
| Nat.succ t => ((HistoriesHorizon t) ×ˢ M.A ×ˢ M.S).map emb_tuple2hist

abbrev ℋₜ : ℕ → Finset (Hist M) := HistoriesHorizon

--theorem Histories

end Histories
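
As an informal sanity check (not proved in this commit), mapping along the embedding emb_tuple2hist preserves cardinality, so Finset.card_map and Finset.card_product give

  |ℋ h 0| = 1,      |ℋ h (t+1)| = |ℋ h t| * |M.A| * |M.S|
  |ℋₜ 0| = |M.S|,   |ℋₜ (t+1)| = |ℋₜ t| * |M.A| * |M.S|

so ℋ h T holds (|M.A| * |M.S|)^T histories extending h by exactly T decision steps.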

section Policies

/-- Decision rule -/
def DecisionRule (M : MDP σ α) := σ → M.A

abbrev 𝒟 : MDP σ α → Type := DecisionRule

/-- A randomized history-dependent policy -/
def PolicyHR (M : MDP σ α) : Type := Hist M → Δ M.A
abbrev Phr : MDP σ α → Type := PolicyHR

/-- A deterministic Markov policy -/
def PolicyMD (M : MDP σ α) : Type := ℕ → σ → M.A
def PolicyMD (M : MDP σ α) : Type := ℕ → DecisionRule M
abbrev Pmd : MDP σ α → Type := PolicyMD

/-- A deterministic stationary policy -/
def PolicySD (M : MDP σ α) : Type := σ → M.A
def PolicySD (M : MDP σ α) : Type := DecisionRule M
abbrev Psd : MDP σ α → Type := PolicySD

instance [DecidableEq α] : Coe (PolicySD M) (PolicyHR M) where
@@ -127,35 +160,31 @@ instance [DecidableEq α] : Coe (PolicySD M) (PolicyHR M) where
instance [DecidableEq α] : Coe (PolicyMD M) (PolicyHR M) where
coe d := fun h ↦ dirac_dist M.A (d h.length h.last)

/-- Set of all histories of additional length T that follow history `h`. -/
def Histories (h : Hist M) : ℕ → Finset (Hist M)
| Nat.zero => {h}
| Nat.succ t => ((Histories h t) ×ˢ M.A ×ˢ M.S).map emb_tuple2hist
end Policies

abbrev ℋ : Hist M → ℕ → Finset (Hist M) := Histories
section Distribution

/-- Probability distribution over histories induced by the policy and
transition probabilities -/
def HistDist (h : Hist M) (π : PolicyHR M) (T : ℕ) : Δ (ℋ h T) :=
match T with
| Nat.zero => dirac_ofsingleton h
| Nat.succ t =>
let prev := HistDist h π t -- previous history
-- probability of the history
let f h' (as : α × σ) := ((π h').p as.1 * (M.P h'.last as.1).p as.2)
-- the second parameter below is the proof of being in Phist pre t; not used
let sumsto_as (h' : Hist M) _ : ∑ as ∈ M.A ×ˢ M.S, f h' as = 1 :=
let prev := HistDist h π t
let update h' (as : α × σ) := ((π h').p as.1 * (M.P h'.last as.1).p as.2)
let probability : Hist M → ℝ≥0
| Hist.init _ => 0 --ignored
| Hist.prev h' a s => prev.p h' * update h' ⟨a,s⟩
-- proof of probability
let sumsto_as (h' : Hist M) _ : ∑ as ∈ M.A ×ˢ M.S, update h' as = 1 :=
prob_prod_prob (π h').p (fun a =>(M.P h'.last a).p )
(π h').sumsto (fun a _ => (M.P h'.last a).sumsto)
let sumsto : ∑ ⟨h',as⟩ ∈ ((Histories h t) ×ˢ M.A ×ˢ M.S), prev.p h' * f h' as = 1 :=
prob_prod_prob prev.p f prev.sumsto sumsto_as
let HAS := ((Histories h t) ×ˢ M.A ×ˢ M.S).map emb_tuple2hist
let p : Hist M → ℝ≥0
| Hist.init _ => 0 --ignored
| Hist.prev h' a s => prev.p h' * f h' ⟨a,s⟩
have sumsto_fin : ∑ h' ∈ HAS, p h' = 1 :=
(Finset.sum_map ((Histories h t) ×ˢ M.A ×ˢ M.S) emb_tuple2hist p) ▸ sumsto
⟨p, sumsto_fin⟩
let sumsto : ∑ ⟨h',as⟩ ∈ ((ℋ h t) ×ˢ M.A ×ˢ M.S), prev.p h' * update h' as = 1 :=
prob_prod_prob prev.p update prev.sumsto sumsto_as
let HAS := ((ℋ h t) ×ˢ M.A ×ˢ M.S).map emb_tuple2hist
have sumsto_fin : ∑ h' ∈ HAS, probability h' = 1 :=
(Finset.sum_map ((ℋ h t) ×ˢ M.A ×ˢ M.S) emb_tuple2hist probability) ▸ sumsto
⟨probability, sumsto_fin⟩
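
Unfolded (an informal reading of the definition above, not a lemma in the commit), the induced distribution satisfies the recursion

  p₀ h = 1
  pₜ₊₁ (Hist.prev h' a s) = pₜ h' * (π h').p a * (M.P h'.last a).p s   for h' ∈ ℋ h t, a ∈ M.A, s ∈ M.S

with histories not of this form receiving probability 0. Normalization combines the sum-to-one proofs of pₜ, π h', and M.P h'.last a via prob_prod_prob, and the resulting sum over ℋ h (t+1) is transported along emb_tuple2hist with Finset.sum_map.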

abbrev Δℋ (h : Hist M) (π : PolicyHR M) (T : ℕ) : Finprob (Hist M) :=
⟨ℋ h T, HistDist h π T⟩
@@ -174,19 +203,34 @@ def reward : Hist M → ℝ
/-- The probability of a history -/
def prob_h (h : Hist M) (π : PolicyHR M) (T : ℕ) (h' : ℋ h T) : ℝ≥0 := (Δℋ h π T).2.p h'

--theorem hdist_eq_prod_pi_P (h : Hist M) (π : Π)

/- ----------- Expectations ---------------- -/


--variable {ρ : Type} [HMulZero ρ] [HMul ℕ ρ ρ] [AddCommMonoid ρ]

/-- Expectation over histories for a r.v. X for horizon T and policy π -/
def expect_h (h : Hist M) (π : PolicyHR M) (T : ℕ) (X : Hist M → ℝ) : ℝ :=
have P := Δℋ h π T
expect (⟨X⟩ : Finrv P ℝ)

notation "𝔼ₕ[" X "//" h "," π "," T "]" => expect_h h π T X
notation "𝔼ₕ[" X "//" h "," π "," t "]" => expect_h h π t X

/-- The k-th state of a history -/
def state [Inhabited σ] (k : ℕ) (h : Hist M) : σ :=
match h with
| Hist.init s => if h.length = k then s else Inhabited.default
| Hist.prev h' _ s => if h.length = k then s else state k h'


/-- The k-th action of a history -/
def action [Inhabited α] (k : ℕ) (h : Hist M) : α :=
match h with
| Hist.init _ => Inhabited.default
| Hist.prev h' a _ => if h.length = k then a else action k h'
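
A small worked example with hypothetical states s₀, s₁, s₂ and actions a₀, a₁: for h = Hist.prev (Hist.prev (Hist.init s₀) a₀ s₁) a₁ s₂ we get h.length = 2 and

  state 0 h = s₀,  state 1 h = s₁,  state 2 h = s₂
  action 1 h = a₀, action 2 h = a₁

while action 0 h, and state k h for k > h.length, fall back to Inhabited.default, since no action precedes the initial state.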


#check List ℕ

end Distribution

/- Conditional expectation with future singletons -/
/-theorem hist_tower_property {hₖ : Hist m} {π : PolicyHR m} {t : ℕ} {f : Hist m → ℝ}