Theory Source_Coding_Theorem

(* Title:       One Part of Shannon's Source Coding Theorem
   Author:      Quentin Hibon <qh225@cl.cam.ac.uk>, Lawrence Paulson <lp15@cam.ac.uk>, 2014
   Maintainer:  Quentin Hibon <qh225@cl.cam.ac.uk>
*)

theory Source_Coding_Theorem
imports "HOL-Probability.Information"
begin
section‹Basic types›

type_synonym bit = bool
type_synonym bword = "bit list"
type_synonym letter = nat
type_synonym 'b word = "'b list"

type_synonym 'b encoder = "'b word ⇒ bword"
type_synonym 'b decoder = "bword ⇒ 'b word option"

section‹Locale for the source coding theorem›
locale source_code = information_space +
  fixes fi :: "'b ⇒ real"
  fixes X :: "'a ⇒ 'b"

  assumes distr_i: "simple_distributed M X fi"
  assumes b_val: "b = 2"

  fixes enc::"'b encoder"
  fixes dec::"'b decoder"
  assumes real_code:
  "dec (enc x) = Some x"
  "enc w = [] ⟷ w = []"
  "x ≠ [] ⟶ enc x = enc [hd x] @ enc (tl x)"

section‹Source coding theorem, direct: the entropy is a lower bound of the code rate›
context source_code
begin
subsection‹The letter set›

definition L :: "'b set" where
  "L ≡ X ` space M"

lemma fin_L: "finite L"
    using L_def distr_i
    by auto

lemma emp_L: "L ≠ {}"
    using L_def subprob_not_empty
    by auto

subsection‹Codes and words›

abbreviation real_word :: "'b word ⇒ bool" where
  "real_word w ≡ (set w ⊆ L)"

abbreviation k_words :: "nat ⇒ ('b word) set" where
  "k_words k ≡ {w. length w = k ∧ real_word w}"

lemma rw_tail:
  assumes "real_word w"
shows "w = [] ∨ real_word (tl w)"
    by (meson assms list.set_sel(2) subset_code(1))

definition code_word_length :: "'e encoder ⇒ 'e ⇒ nat" where
  "code_word_length e l = length (e [l])"

abbreviation cw_len :: "'b ⇒ nat" where
  "cw_len l ≡ code_word_length enc l"

definition code_rate :: "'e encoder ⇒ ('a ⇒ 'e) ⇒ real" where
  "code_rate e Xo = expectation (λa. (code_word_length e ((Xo) a)))"

lemma fi_pos: "i∈ L ⟹ 0 ≤ fi i"
    using simple_distributed_nonneg[OF distr_i] L_def by auto

lemma (in prob_space) simp_exp_composed:
  assumes X: "simple_distributed M X Px"
shows "expectation (λa. f (X a)) = (∑x ∈ X`space M. f x * Px x)"
  using distributed_integral[OF simple_distributed[OF X], of f]
    simple_distributed_nonneg[OF X]
    lebesgue_integral_count_space_finite[OF simple_distributed_finite[OF X], of "λx. f x * Px x"]
  by (simp add:  ac_simps)

lemma cr_rw:
  "code_rate enc X = (∑i ∈ X ` space M. fi i * cw_len i)"
    using simp_exp_composed[OF distr_i, of "cw_len"]
    by (simp add: mult.commute code_rate_def)

abbreviation cw_len_concat :: "'b word ⇒ nat" where
  "cw_len_concat w ≡ foldr (λx s. (cw_len x) + s) w 0"

lemma cw_len_length: "cw_len_concat w = length (enc w)"
proof (induction w)
    case Nil
    show ?case using real_code by simp
    case (Cons a w)
    have "cw_len_concat (a # w) = cw_len a + cw_len_concat w" by simp
    thus ?case using code_word_length_def real_code Cons
      by (metis length_append list.distinct(1) list.sel(1) list.sel(3))
qed

lemma maj_fold:
  assumes "⋀l. l∈L ⟹ f l ≤ bound"
  assumes "real_word w"
shows "foldr (λx s. f x + s) w 0 ≤ length w * bound"
    using assms
    by(induction w) (simp,fastforce)

definition max_len :: "nat" where
  "max_len = Max ((λx. cw_len x) ` L)"

lemma max_cw:
  "l ∈ L ⟹ cw_len l ≤ max_len"
    by (simp add: max_len_def fin_L)

subsection‹Related to the Kraft theorem›
definition 𝒦 :: "real" where
  "𝒦 = (∑i∈L. 1 / b ^ (cw_len i))"

lemma pos_cw_len: "0 < 1 / b ^ cw_len i" using b_gt_1 by simp

lemma 𝒦_pos: "0 < 𝒦"
    using emp_L fin_L pos_cw_len sum_pos 𝒦_def
    by metis

lemma 𝒦_pow: "𝒦 = (∑i∈L. 1 / b powr cw_len i)"
    using powr_realpow b_gt_1
    by (simp add: 𝒦_def)

lemma k_words_rel:
  "k_words (Suc k) = {w. (hd w ∈ L ∧ tl w ∈ k_words k ∧ w ≠ [])}"
proof
    fix k
    show "k_words (Suc k) ⊆ {w. (hd w ∈ L ∧ tl w ∈ k_words k ∧ w ≠ [] )}" (is "?l ⊆ ?r")
  proof
      fix w
      assume w_kw: "w ∈ k_words (Suc k)"
      hence "real_word w" by simp
      hence "hd w ∈ L"
        by (metis (mono_tags) w_kw hd_in_set list.size(3) mem_Collect_eq nat.distinct(1) subset_code(1))
      moreover have "length w = Suc k" using w_kw by simp
      moreover hence "w ≠ []" by auto
      moreover have "real_word (tl w)" using ‹real_word w› calculation(3) rw_tail by auto
      ultimately show "w ∈ ?r" using w_kw by simp
  qed
next
    fix k
    show "k_words (Suc k) ⊇ {w. (hd w ∈ L ∧ tl w ∈ k_words k ∧ w ≠ [])}"
  proof
      fix w
      assume asm: "w ∈ {w. hd w ∈ L ∧ tl w ∈ {w. length w = k ∧ real_word w} ∧ w ≠ []}"
      hence "hd w ∈ L ∧ length (tl w) = k ∧ real_word (tl w)" by simp
      hence "real_word w"
        by (metis empty_iff insert_subset list.collapse list.set(1) set_simps(2) subsetI)
      moreover hence "length w = Suc k" using asm by auto
      ultimately show "w ∈ k_words (Suc k)" by simp
  qed
qed

lemma bij_k_words:
shows "bij_betw (λwi. Cons (fst wi) (snd wi)) (L × k_words k) (k_words (Suc k))"
    unfolding bij_betw_def
proof
    fix k
    let ?f = "(λwi. Cons (fst wi) (snd wi))"
    let ?S = "L × (k_words k)"
    let ?T = "k_words (Suc k)"
    show "inj_on ?f ?S" by (simp add: inj_on_def)
    show "?f`?S = ?T"
  proof (rule ccontr)
      assume "?f ` ?S ≠ ?T"
      hence "∃w. w∈ ?T ∧ w ∉ ?f`?S" by auto
      then obtain w where asm: "w∈ ?T ∧ w ∉ ?f`?S" by blast
      hence "w = ?f (hd w,tl w)" using k_words_rel by simp
      moreover have "(hd w,tl w) ∈ ?S" using k_words_rel asm by simp
      ultimately have "w ∈ ?f`?S" by blast
      thus "False" using asm by simp
  qed
qed

lemma finite_k_words: "finite (k_words k)"
proof (induct k)
    case 0
    show ?case by simp
    case (Suc n)
    thus ?case using bij_k_words bij_betw_finite fin_L by blast
qed

lemma cartesian_product:
  fixes f::"('c ⇒ real)"
  fixes g::"('d ⇒ real)"
  assumes "finite A"
  assumes "finite B"
shows "(∑b∈B. g b) * (∑a∈A. f a) = (∑ab∈A×B. f (fst ab) * g (snd ab))"
    using bilinear_times bilinear_sum[where h="(λx y. x * y)" and f="f" and g="g"] assms
    by (metis (erased, lifting) sum.cong split_beta' Groups.ab_semigroup_mult_class.mult.commute)

lemma 𝒦_power:
shows "𝒦^k = (∑w ∈ (k_words k). 1 / b^(cw_len_concat w))"
proof (induct k)
    case 0
    have "k_words 0 = {[]}" by auto
    thus ?case by simp
next
    case (Suc n)
    have " 𝒦 ^Suc n = 𝒦 ^n * 𝒦 " by simp
    also have "… = (∑w ∈ k_words n. 1 / b^cw_len_concat w) * (∑i∈L. 1 / b^cw_len i)"
      using Suc.hyps 𝒦_def by auto
    also have "… = (∑wi ∈ L × k_words n. 1/b^cw_len (fst wi) * (1 / b^cw_len_concat (snd wi)))"
      using fin_L finite_k_words cartesian_product
      by blast
    also have "… = (∑wi ∈ L × k_words n. 1 / b^(cw_len_concat (snd wi) + cw_len (fst wi)))"
      by (metis (no_types, lifting) power_add add.commute power_one_over)
    also have "… = (∑wi ∈ L × k_words n. 1 / b^cw_len_concat (fst wi # snd wi))"
      by (metis (erased, lifting) add.commute comp_apply foldr.simps(2))
    also have "… = (∑w ∈ (k_words (Suc n)). 1 / b^(cw_len_concat w))"
      using sum.reindex_bij_betw [OF bij_k_words] by fastforce
    finally show ?case by simp
qed

lemma bound_len_concat:
shows "w ∈ k_words k ⟹ cw_len_concat w ≤ k * max_len"
    using max_cw maj_fold by blast

subsection‹Inequality of the kraft sum (source coding theorem, direct)›
subsubsection‹Sum manipulation lemmas and McMillan theorem›

lemma sum_vimage_proof:
  fixes g::"nat ⇒ real"
  assumes "⋀w. f w < bd"
shows "finite S ⟹ (∑w∈S. g (f w)) = (∑ m=0..<bd. (card ((f-`{m}) ∩ S) )* g m)"
(is "_ ⟹ _ = (∑ m=0..<bd. ?ff m S)")
proof (induct S rule: finite_induct)
    case empty
    show ?case by simp
next
    case (insert x F)
    let ?rr = "(∑m = 0..<bd. ?ff m (insert x F))"
    have "(f x) ∈ {0..<bd}" using assms by simp
    hence "⋀h::(nat ⇒ real). (∑m=0..<bd. h m) = (∑y∈({0..<bd} - {f x}).h y) + h (f x)"
      by (metis diff_add_cancel finite_atLeastLessThan sum_diff1)
    moreover hence
    "(∑m = 0..<bd. ?ff m (insert x F))
    = (∑m∈{0..<bd} - {f x}. ?ff m (insert x F)) + card (f -` {f x} ∩ F) * g (f x) + g (f x)"
      by (simp add: semiring_normalization_rules(2), simp add: insert)
    ultimately have "(∑m = 0..<bd. ?ff m (insert x F)) = (∑m∈{0..<bd}. ?ff m F) + g (f x)"
      by fastforce
    thus ?case using insert by simp
qed

lemma sum_vimage:
  fixes g::"nat ⇒ real"
  assumes bounded: "⋀w. w ∈ S ⟹ f w < bd" and "0 < bd"
  assumes finite: "finite S"
shows "(∑w∈S. g (f w)) = (∑ m=0..<bd. (card ((f-`{m}) ∩ S) ) * g m)"
(is "?s1 = ?s2")
proof -
    let ?ff = "(λx. if x∈S then f x else 0)"
    let ?ss1 = "(∑w∈S. g (?ff w))"
    let ?ss2 = "(∑ m=0..<bd. (card ((?ff-`{m}) ∩ S) ) * g m)"
    have "?s1 =?ss1" by simp
    moreover have"⋀m. ?ff -`{m} ∩ S = f-`{m} ∩ S" by auto
    moreover hence "?s2 = ?ss2" by simp
    moreover have "⋀w . ?ff w < bd" using assms by simp
    moreover hence "?ss1 = ?ss2" using sum_vimage_proof[of "?ff"] finite by blast
    ultimately show "?s1 = ?s2" by metis
qed

lemma 𝒦_rw:
  "(∑w ∈ (k_words k). 1 / b^(cw_len_concat w)) = (∑m=0..<Suc (k*max_len). card (k_words k ∩
((cw_len_concat) -` {m})) * (1 / b^m))" (is "?L = ?R")
proof -
    have "⋀w. w ∈ k_words k ⟹ cw_len_concat w < Suc ( k * max_len)"
      by (simp add: bound_len_concat le_imp_less_Suc)
    moreover have
    "?R = (∑m = 0..<Suc (k * max_len).
  (card (cw_len_concat -` {m} ∩ k_words k)) * (1 / b ^ m))"
      by (metis Int_commute)
    moreover have "0 < Suc (k*max_len)" by simp
    ultimately show ?thesis
      using finite_k_words
    sum_vimage[where f="cw_len_concat" and g = "λi. 1/ (b^i)"]
      by fastforce
qed

definition set_of_k_words_length_m :: "nat ⇒ nat ⇒ 'b word set" where
  "set_of_k_words_length_m k m = {xk. xk ∈ k_words k} ∩ (cw_len_concat)-`{m}"

lemma am_inj_code: "inj_on enc ((cw_len_concat)-`{m})" (is "inj_on _ ?s")
  using inj_on_def[of enc "?s"] real_code
  by (metis option.inject)

lemma img_inc: "enc`cw_len_concat-`{m} ⊆ {bl. length bl = m}" using cw_len_length by auto

lemma bool_lists_card: "card {bl::bool list. length bl = m} = b^m"
 using card_lists_length_eq[of "UNIV::bool set"]
  by (simp add: b_val)

lemma bool_list_fin: "finite {bl::bool list. length bl = m}"
  using finite_lists_length_eq[of "UNIV::bool set"]
  by (simp add: b_val)

lemma set_of_k_words_bound:
shows "card (set_of_k_words_length_m k m) ≤ b^m" (is "?c ≤ ?b")
proof -
    have card_w_len_m_bound: "card (cw_len_concat-`{m}) ≤ b^m"
      by (metis (no_types, lifting) am_inj_code bool_list_fin bool_lists_card card_image card_mono
    img_inc of_nat_le_iff)
    have "set_of_k_words_length_m k m ⊆ (cw_len_concat)-`{m}"
      by (simp add: set_of_k_words_length_m_def)
    hence "card (set_of_k_words_length_m k m) ≤ card ((cw_len_concat)-`{m})"
      by (metis (no_types, lifting) am_inj_code bool_list_fin card.infinite card_0_eq
    card_image card_mono empty_iff finite_subset img_inc inf_img_fin_dom)
    thus ?thesis using card_w_len_m_bound by simp
qed

lemma empty_set_k_words:
  assumes "0 < k"
shows "set_of_k_words_length_m k 0 = {}"
proof(rule ccontr)
    assume "¬ set_of_k_words_length_m k 0 = {}"
    hence "∃x. x ∈ set_of_k_words_length_m k 0" by auto
    then obtain x where x_def: "x ∈ set_of_k_words_length_m k 0" by auto
    hence "x ≠ []" unfolding set_of_k_words_length_m_def using assms by auto
    moreover have "cw_len_concat (hd x#tl x) = cw_len_concat (tl x) + cw_len (hd x)"
      by (metis add.commute comp_apply foldr.simps(2))
    moreover have "enc [(hd x)] ≠ []" using assms real_code by blast
    moreover hence "0 < cw_len (hd x)" unfolding code_word_length_def by simp
    ultimately have "x ∉ set_of_k_words_length_m k 0" by (simp add:set_of_k_words_length_m_def)
    thus "False" using x_def by simp
qed

lemma 𝒦_rw2:
  assumes "0 < k"
shows "(∑m=0..<Suc (k * max_len). card (set_of_k_words_length_m k m)/ b^m) ≤ (k * max_len)"
proof -
    have
    "(∑m=1..<Suc (k * max_len). card (set_of_k_words_length_m k m) / b^m)
    ≤ (∑m=1..<Suc(k * max_len). b^m / b^m)"
      using set_of_k_words_bound b_val
    Groups_Big.sum_mono[of "{1..<Suc(k * max_len)}"
    "(λm. (card (set_of_k_words_length_m k m))/b^m)" "λm. b^m /b^m"]
      by simp
    moreover have"(∑m=1..<Suc(k * max_len). b^m / b^m) = (∑m=1..<Suc(k *max_len). 1)"
      using b_gt_1 by simp
    moreover have "(∑m=1..<Suc(k * max_len). 1) = (k * max_len)"
      by simp
    ultimately have
    "(∑m = 1..<Suc (k * max_len). card (set_of_k_words_length_m k m) / b ^ m) ≤ k * max_len"
      by (metis One_nat_def card_atLeastLessThan card_eq_sum diff_Suc_Suc real_of_card)
    thus ?thesis using empty_set_k_words assms
      by (simp add: sum_shift_lb_Suc0_0_upt split: if_split_asm)
qed

lemma 𝒦_power_bound :
  assumes "0 < k"
shows " 𝒦^k ≤ k * max_len"
    using assms 𝒦_power 𝒦_rw 𝒦_rw2
    by (simp add: set_of_k_words_length_m_def)

theorem McMillan :
shows "𝒦 ≤ 1"
proof -
    have ineq: "⋀k. 0 < k ⟹ 𝒦 ≤ root k k * root k max_len"
      using 𝒦_pos 𝒦_power_bound
      by (metis (no_types, opaque_lifting) not_less of_nat_0_le_iff of_nat_mult power_strict_mono real_root_mult real_root_pos_pos_le real_root_pos_unique real_root_power)
    hence "0 < max_len ⟹ (λk. root k k * root k max_len) ⇢ 1"
      by (auto intro!: tendsto_eq_intros LIMSEQ_root LIMSEQ_root_const)
    moreover have "∀n≥1. 𝒦 ≤ root n n * root n max_len"
      using ineq by simp
    moreover have "max_len = 0 ⟹ 𝒦 ≤ 1" using ineq by fastforce
    ultimately show " 𝒦 ≤ 1" using LIMSEQ_le_const by blast
qed

lemma entropy_rw: "ℋ(X) = -(∑i ∈ L. fi i * log b (fi i))"
    using entropy_simple_distributed[OF distr_i]
    by (simp add: L_def)

subsubsection‹Technical lemmas about the logarithm›
lemma log_mult_ext3:
  "0 ≤ x ⟹ 0 < y ⟹ 0 < z ⟹ x * log b (x*y*z) = x * log b (x*y) + x * log b z"
  by (metis dual_order.irrefl log_mult ring_distribs(1) mult_eq_0_iff)

lemma log_mult_ext2:
  "0 ≤ x ⟹ 0 < y ⟹ x * log b (x*y) = x * log b x + x * log b y"
    using log_mult_ext3[where y=1] by simp

subsubsection ‹KL divergence and properties›
definition KL_div ::"'b set ⇒ ('b ⇒ real) ⇒ ('b ⇒ real) ⇒ real" where
  "KL_div S a d = (∑ i ∈ S. a i * log b (a i / d i))"

lemma KL_div_mul:
  assumes "0 < d" "d ≤ 1"
  assumes "⋀i. i∈S ⟹ 0 ≤ a i"
  assumes "⋀i. i∈S ⟹ 0 < e i"
shows "KL_div S a e ≥ KL_div S a (λi. e i / d)"
    unfolding KL_div_def
proof -
    {
    fix i
    assume "i∈S"
    hence "a i / (e i / d) ≤ a i / e i" using assms
      by (metis (no_types) div_by_1 frac_le less_imp_triv not_less)
    hence "log b (a i / (e i / d)) ≤ log b (a i / e i)" using assms
      by (smt (verit, best) Transcendental.log_mono ‹i ∈ S› b_gt_1 diff_divide_distrib divide_pos_pos)
    }
    thus "(∑i∈S. a i * log b (a i / (e i / d))) ≤ (∑i∈S. a i * log b (a i / e i))"
      by (meson mult_left_mono assms sum_mono)
qed

lemma KL_div_pos:
  fixes a e::"'b ⇒ real"
  assumes fin: "finite S"
  assumes nemp: "S ≠ {}"
  assumes non_null: "⋀i. i∈S ⟹ 0 < a i" "⋀i. i∈ S ⟹ 0 < e i"
  assumes sum_a_one: "(∑ i ∈ S. a i) = 1"
  assumes sum_c_one: "(∑ i ∈ S. e i) = 1"
shows "0 ≤ KL_div S a e"
    unfolding KL_div_def
proof -
    let ?f = "λi. e i / a i"
    have f_pos: "⋀i. i∈S ⟹ 0 < ?f i"
      using non_null
      by simp
    have a_pos: "⋀i. i∈ S ⟹ 0 ≤ a i"
      using non_null
      by (simp add: order.strict_implies_order)
    have "- log b (∑i∈S. a i * e i / a i) ≤ (∑i∈S. a i * - log b (e i / a i))"
      using convex_on_sum[OF fin nemp  minus_log_convex[OF b_gt_1] 
                             sum_a_one a_pos, of "λi. e i / a i"] f_pos by simp
    also have "-log b (∑i∈S. a i * e i / a i) = -log b (∑i∈S. e i)"
  proof -
      from non_null(1) have "⋀i. i ∈ S ⟹ a i * e i / a i = e i" by force
      thus ?thesis by simp
  qed
    finally have "0 ≤ (∑i∈S. a i * - log b (e i / a i))"
      by (simp add: sum_c_one)
    thus "0 ≤ (∑i∈S. a i * log b (a i / e i))"
      by (smt (verit, best) b_gt_1 log_divide non_null sum_mono)
qed

lemma KL_div_pos_emp:
  "0 ≤ KL_div {} a e" by (simp add: KL_div_def)

lemma KL_div_pos_gen:
  fixes a d::"'b ⇒ real"
  assumes fin: "finite S"
  assumes non_null: "⋀i. i∈S ⟹ 0 < a i" "⋀i. i∈ S ⟹ 0 < d i"
  assumes sum_a_one: "(∑ i ∈ S. a i) = 1"
  assumes sum_d_one: "(∑ i ∈ S. d i) = 1"
shows "0 ≤ KL_div S a d"
    using KL_div_pos KL_div_pos_emp assms by metis

theorem KL_div_pos2:
  fixes a d::"'b ⇒ real"
  assumes fin: "finite S"
  assumes non_null: "⋀i. i∈S ⟹ 0 ≤ a i" "⋀i. i∈ S ⟹ 0 < d i"
  assumes sum_a_one: "(∑ i ∈ S. a i) = 1"
  assumes sum_c_one: "(∑ i ∈ S. d i) = 1"
shows "0 ≤ KL_div S a d"
proof -
    have "S = (S ∩ {i. 0 < a i}) ∪ (S ∩ {i. 0 = a i})" using non_null(1) by fastforce
    moreover have "(S ∩ {i. 0 < a i}) ∩ (S ∩ {i. 0 = a i}) = {}" by auto
    ultimately have
    eq: "KL_div S a d = KL_div (S ∩ {i. 0 < a i}) a d + KL_div (S ∩ {i. 0 = a i}) a d"
      unfolding KL_div_def
      by (metis (mono_tags, lifting) fin finite_Un sum.union_disjoint)
    have "KL_div (S ∩ {i. 0 = a i}) a d = 0" unfolding KL_div_def by simp
    hence "KL_div S a d = KL_div (S ∩ {i. 0 < a i}) a d" using eq by simp
    moreover have "0 ≤ KL_div (S ∩ {i. 0 < a i}) a d"
  proof(cases "(S ∩ {i. 0 < a i}) = {}")
      case True
      thus ?thesis unfolding KL_div_def by simp
  next
      case False
      let ?c = "λi. d i / (∑j ∈(S ∩ {i. 0 < a i}). d j)"
      have 1: "(⋀i. i ∈ S ∩ {i. 0 < a i} ⟹ 0 < a i)" by simp
      have 2: "(⋀i. i ∈ S ∩ {i. 0 < a i} ⟹ 0 < ?c i)"
        by (metis False IntD1 divide_pos_pos fin finite_Int non_null(2) sum_pos)
      have 3: "(∑i∈ (S ∩ {i. 0 < a i}). a i) = 1"
        using sum.cong[of S, of S, of "(λx. if x ∈ {i. 0 < a i} then a x else 0)", of a]
      sum.inter_restrict[OF fin, of a] non_null(1) sum_a_one
        by fastforce
      have "(∑i∈S ∩ {j. 0 < a j}. ?c i) = (∑i∈S ∩ {j. 0 < a j}. d i) / (∑i∈S ∩ {j. 0 < a j}. d i)"
        by (metis sum_divide_distrib)
      hence 5: "(∑i∈S ∩ {j. 0 < a j}. ?c i) = 1" using 2 False by force
      hence "0 ≤ KL_div (S ∩ {j. 0 < a j}) a ?c"
        using KL_div_pos_gen[
      OF finite_Int[OF disjI1, of S, of "{j. 0 < a j}"], of a, of ?c
      ] 1 2 3
        by (metis fin)
      have fstdb: "0 < (∑i∈S ∩ {i. 0 < a i}. d i)" using non_null(2) False
        by (metis Int_Collect fin finite_Int sum_pos)
      have 6: "0 ≤ KL_div (S ∩ {i. 0 < a i}) a (λi. d i / (∑i∈(S ∩ {i. 0 < a i}). d i))"
        using 2 3 5
        KL_div_pos_gen[
      OF finite_Int[OF disjI1, OF fin], of "{i. 0 < a i}", of "a", of "?c"
      ]
        by simp
      hence
      "KL_div (S ∩ {j. 0 < a j}) a (λi. d i / (∑i∈(S ∩ {i. 0 < a i}). d i)) ≤ KL_div (S ∩ {j. 0 < a j}) a d"
        using non_null sum.inter_restrict[OF fin, of d, of "{i. 0 < a i}"]
        sum_mono[of S, of "(λx. if x ∈ {i. 0 < a i} then d x else 0)", of d] non_null(2) sum_c_one
        non_null(2) fstdb KL_div_mul
        by force
      moreover have "0 ≤ KL_div (S ∩ {j. 0 < a j}) a (λi. d i / (∑i∈(S ∩ {i. 0 < a i}). d i))"
        using KL_div_pos_gen[ OF finite_Int[OF disjI1, OF fin]] using 2 3 5 by fastforce
      ultimately show "0 ≤ KL_div (S ∩ {j. 0 < a j}) a d" by simp
  qed
    ultimately show ?thesis by simp
qed

lemma sum_div_1:
  fixes f::"'b ⇒ 'c::field"
  assumes "(∑i∈A. f i) ≠ 0"
shows "(∑i∈A. f i / (∑j∈A. f j)) = 1"
    by (metis (no_types) assms right_inverse_eq sum_divide_distrib)

theorem rate_lower_bound:
shows "ℋ(X) ≤ code_rate enc X"
proof -
    let ?cr = "code_rate enc X"
    let ?r = "(λi. 1 / ((b powr cw_len i) * 𝒦))"
    have pos_pi: "⋀i. i ∈ L ⟹ 0 ≤ fi i" using fi_pos by simp
    {
    fix i
    assume "i ∈ L"
    hence
    "fi i * (log b (1 / (1 / b powr (cw_len i))) + log b (fi i))
    = fi i * log b (fi i / (1 / b powr (cw_len i)))"
      using log_mult_ext2 [OF pos_pi, of i] b_gt_1
      by simp (simp add: algebra_simps)
    }
    hence eqpi:
    "⋀i. i∈ L ⟹ fi i * (log b (1 / (1 / b powr (cw_len i))) + log b (fi i))
    = fi i * log b (fi i / (1 / b powr (cw_len i)))"
      by simp
    have sum_one_L: "(∑ i ∈ L. fi i) = 1"
      using simple_distributed_sum_space[OF distr_i] by (simp add: L_def)
    {
    fix i
    assume "i ∈ L"
    hence h1: "0 ≤ fi i" using pos_pi by blast
    have h2: "0 < 𝒦 / (1/b powr cw_len i)" using b_gt_1 𝒦_pos by auto
    have h3: "0 < 1 / 𝒦" using 𝒦_pos by simp
    have
    "fi i * log b (fi i * 𝒦 / (1/b powr cw_len i) * (1/ 𝒦)) =
    fi i * log b (fi i * 𝒦 / (1/b powr cw_len i)) + fi i * log b (1/ 𝒦)"
      using log_mult_ext3[OF h1 h2 h3]
      by (metis times_divide_eq_right)
    } hence big_eq:
    "⋀i. i ∈ L ⟹ fi i * log b (fi i * 𝒦 / (1/b powr cw_len i) * (1 / 𝒦)) =
    fi i * log b (fi i * 𝒦 / (1/b powr cw_len i)) + fi i * log b (1 / 𝒦)"
      by (simp add: inverse_eq_divide)
    have 1: "?cr - ℋ(X) = (∑i ∈ L. fi i * cw_len i) + (∑i ∈ L. fi i * log b (fi i))"
      using 𝒦_def entropy_rw cr_rw L_def by simp
    also have 2: "(∑i∈L. fi i * cw_len i) = (∑i ∈ L. fi i * (-log b (1/(b powr (cw_len i)))))"
      using b_gt_1 log_divide by simp
    also have "… = -1 * (∑i ∈ L. fi i * (log b (1/(b powr (cw_len i)))))"
      using sum_distrib_left[of "-1" "(λi. fi i * (- 1 * log b (1 / b powr (cw_len i))))" L]
      by simp
    finally have
    "?cr - ℋ(X) = -(∑i ∈ L. fi i * log b (1/b powr cw_len i)) + (∑i ∈ L. fi i * log b (fi i))"
      by simp
    have "?cr - ℋ(X) = (∑i ∈ L. fi i * ((log b (1/ (1/(b powr (cw_len i))))) + log b (fi i)))"
      using b_gt_1 1
      by (simp add: distrib_left sum.distrib)
    also have "… = (∑i ∈ L. fi i *((log b (fi i / (1/(b powr (cw_len i)))))))"
      using Finite_Cartesian_Product.sum_cong_aux[OF eqpi] by simp
    also from big_eq have
    "… = (∑i∈L. fi i * (log b (fi i * 𝒦 / (1 / b powr (cw_len i))))) + (∑i ∈ L. fi i) * log b (1/ 𝒦)"
      using 𝒦_pos
      by (simp add: sum_distrib_right sum.distrib)
    also have "… = (∑i∈L. fi i * (log b (fi i * 𝒦 / (1 / b powr (cw_len i))))) - log b (𝒦)"
      using 𝒦_pos
      by (simp add: log_inverse divide_inverse sum_one_L)
    also have "… = (∑ i ∈ L. fi i * log b (fi i / ?r i)) - log b (𝒦)"
      by (metis (mono_tags, opaque_lifting) divide_divide_eq_left divide_divide_eq_right)
    also have "… = KL_div L fi ?r - log b ( 𝒦)"
      using b_gt_1 𝒦_pos log_inverse KL_div_def
      by simp
    also have "… = KL_div L fi ?r + log b (1 / 𝒦)"
      using log_inverse b_val 𝒦_pos
      by (simp add: inverse_eq_divide)
    finally have code_ent_kl_log: "?cr - ℋ(X) = KL_div L fi ?r + log b (1 / 𝒦)" by simp
    have "(∑i∈L. ?r i) = 1"
      using sum_div_1[of "λi. 1 / (b powr (cw_len i))"] 𝒦_pos 𝒦_pow
      by simp
    moreover have "⋀i. 0 < ?r i" using b_gt_1 𝒦_pos by simp
    moreover have "(∑i∈L. fi i) = 1" using sum_one_L by simp
    ultimately have "0 ≤ KL_div L fi ?r"
      using KL_div_pos2[OF fin_L fi_pos] by simp
    hence "log b (1 / 𝒦) ≤ ?cr - ℋ(X)" using code_ent_kl_log by simp
    moreover from McMillan have "0 ≤ log b (1 / 𝒦)"
      using 𝒦_pos
      by (simp add: b_gt_1)
    ultimately show ?thesis by simp
qed

end

end