Theory Non_Regular_Languages

(* 
  File:     Non_Regular_Languages.thy
  Author:   Manuel Eberl <manuel@pruvisto.org>
  
  This file provides some tools for showing the non-regularity of a language, either 
  via an infinite set of equivalence classes or via the Pumping Lemma.
*)
section ‹Tools for showing non-regularity of a language›
theory Non_Regular_Languages
  imports Myhill
begin

subsection ‹Auxiliary material›

lemma bij_betw_image_quotient:
  "bij_betw (λy. f -` {y}) (f ` A) (A // {(a,b). f a = f b})"
  by (force simp: bij_betw_def inj_on_def image_image quotient_def)
   
lemma regular_Derivs_finite:
  fixes r :: "'a :: finite rexp"
  shows "finite (range (λw. Derivs w (lang r)))"
proof -
  have "?thesis ⟷ finite (UNIV // ≈lang r)"
    unfolding str_eq_conv_Derivs by (rule bij_betw_finite bij_betw_image_quotient)+
  also have "…" by (subst Myhill_Nerode [symmetric]) auto
  finally show ?thesis .
qed
  
lemma Nil_in_Derivs_iff: "[] ∈ Derivs w A ⟷ w ∈ A"
  by (auto simp: Derivs_def)

(* TODO: Move to distribution? *)
text ‹
  The following operation repeats a list $n$ times (usually written as $w ^ n$).
›
primrec repeat :: "nat ⇒ 'a list ⇒ 'a list" where
  "repeat 0 xs = []"
| "repeat (Suc n) xs = xs @ repeat n xs"
  
lemma repeat_Cons_left: "repeat (Suc n) xs = xs @ repeat n xs" by simp
    
lemma repeat_Cons_right: "repeat (Suc n) xs = repeat n xs @ xs"
  by (induction n) simp_all
    
lemma repeat_Cons_append_commute [simp]: "repeat n xs @ xs = xs @ repeat n xs"
  by (subst repeat_Cons_right [symmetric]) simp
    
lemma repeat_Cons_add [simp]: "repeat (m + n) xs = repeat m xs @ repeat n xs"
  by (induction m) simp_all
    
lemma repeat_Nil [simp]: "repeat n [] = []"
  by (induction n) simp_all
    
lemma repeat_conv_replicate: "repeat n xs = concat (replicate n xs)"
  by (induction n) simp_all

    
(* TODO: Move to distribution? *)
lemma nth_prefixes [simp]: "n ≤ length xs ⟹ prefixes xs ! n = take n xs"
  by (induction xs arbitrary: n) (auto simp: nth_Cons split: nat.splits)
    
lemma nth_suffixes [simp]: "n ≤ length xs ⟹ suffixes xs ! n = drop (length xs - n) xs"
  by (subst suffixes_conv_prefixes) (simp_all add: rev_take)

lemma length_take_prefixes:
  assumes "xs ∈ set (take n (prefixes ys))"
  shows   "length xs < n"
proof (cases "n ≤ Suc (length ys)")
  case True
  with assms obtain i where "i < n" "xs = take i ys"
    by (subst (asm) nth_image [symmetric]) auto
  thus ?thesis by simp
next
  case False
  with assms have "prefix xs ys" by simp
  hence "length xs ≤ length ys" by (rule prefix_length_le)
  also from False have "… < n" by simp
  finally show ?thesis .
qed


subsection ‹Non-regularity by giving an infinite set of equivalence classes›
  
text ‹
  Non-regularity can be shown by giving an infinite set of non-equivalent words (w.r.t. the 
  Myhill--Nerode relation).
›
lemma not_regular_langI:
  assumes "infinite B" "⋀x y. x ∈ B ⟹ y ∈ B ⟹ x ≠ y ⟹ ∃w. ¬(x @ w ∈ A ⟷ y @ w ∈ A)"
  shows   "¬regular_lang (A :: 'a :: finite list set)"
proof -
  have "(λw. Derivs w A) ` B ⊆ range (λw. Derivs w A)" by blast
  moreover from assms(2) have "inj_on (λw. Derivs w A) B"
    by (auto simp: inj_on_def Derivs_def)
  with assms(1) have "infinite ((λw. Derivs w A) ` B)"
    by (blast dest: finite_imageD)
  ultimately have "infinite (range (λw. Derivs w A))" by (rule infinite_super)
  with regular_Derivs_finite show ?thesis by blast
qed

lemma not_regular_langI':
  assumes "infinite B" "⋀x y. x ∈ B ⟹ y ∈ B ⟹ x ≠ y ⟹ ∃w. ¬(f x @ w ∈ A ⟷ f y @ w ∈ A)"
  shows   "¬regular_lang (A :: 'a :: finite list set)"
proof (rule not_regular_langI)
  from assms(2) have "inj_on f B" by (force simp: inj_on_def)
  with ‹infinite B› show "infinite (f ` B)" by (simp add: finite_image_iff)
qed (insert assms, auto)

  
subsection ‹The Pumping Lemma›

text ‹
  The Pumping lemma can be shown very easily from the Myhill--Nerode theorem: if we have 
  a word whose length is more than the (finite) number of equivalence classes, then it must 
  have two different prefixes in the same class and the difference between these two 
  prefixes can then be ``pumped''.
›
lemma pumping_lemma_aux:
  fixes A :: "'a list set"
  defines "δ ≡ λw. Derivs w A"
  defines "n ≡ card (range δ)"
  assumes "z ∈ A" "finite (range δ)" "length z ≥ n"
  shows "∃u v w. z = u @ v @ w ∧ length (u @ v) ≤ n ∧ v ≠ [] ∧ (∀i. u @ repeat i v @ w ∈ A)"
proof -
  define P where "P = set (take (Suc n) (prefixes z))"
  from ‹length z ≥ n› have [simp]: "card P = Suc n"
    unfolding P_def by (subst distinct_card) (auto intro!: distinct_take)
  have length_le: "length y ≤ n" if "y ∈ P" for y
    using length_take_prefixes[OF that [unfolded P_def]] by simp
      
  have "card (δ ` P) ≤ card (range δ)" by (intro card_mono assms) auto
  also from assms have "… < card P" by simp
  finally have "¬inj_on δ P" by (rule pigeonhole)
  then obtain a b where ab: "a ∈ P" "b ∈ P" "a ≠ b" "Derivs a A = Derivs b A"
    by (auto simp: inj_on_def δ_def)
  from ab have prefix_ab: "prefix a z" "prefix b z" by (auto simp: P_def dest: in_set_takeD)
  from ab have length_ab: "length a ≤ n" "length b ≤ n"
    by (simp_all add: length_le)
      
  have *: ?thesis 
    if uz': "prefix u z'" "prefix z' z" "length z' ≤ n" 
            "u ≠ z'" "Derivs z' A = Derivs u A" for u z'
  proof -
    from ‹prefix u z'› and ‹u ≠ z'› 
      obtain v where v [simp]: "z' = u @ v" "v ≠ []" 
      by (auto simp: prefix_def)
    from ‹prefix z' z› obtain w where [simp]: "z = u @ v @ w" 
      by (auto simp: prefix_def)
      
    hence [simp]: "Derivs (repeat i v) (Derivs u A) = Derivs u A" for i
      by (induction i) (use uz' in simp_all)
    have "Derivs z A = Derivs (u @ repeat i v @ w) A" for i
      using uz' by simp
    with ‹z ∈ A› and uz' have "∀i. u @ repeat i v @ w ∈ A"
      by (simp add: Nil_in_Derivs_iff [of _ A, symmetric])
    moreover have "z = u @ v @ w" by simp
    moreover from ‹length z' ≤ n› have "length (u @ v) ≤ n" by simp
    ultimately show ?thesis using ‹v ≠ []› by blast
  qed
  
  from prefix_ab have "prefix a b ∨ prefix b a" by (rule prefix_same_cases)
  with *[of a b] and *[of b a] and ab and prefix_ab and length_ab show ?thesis by blast
qed

theorem pumping_lemma:
  fixes r :: "'a :: finite rexp"
  obtains n where 
    "⋀z. z ∈ lang r ⟹ length z ≥ n ⟹ 
            ∃u v w. z = u @ v @ w ∧ length (u @ v) ≤ n ∧ v ≠ [] ∧ (∀i. u @ repeat i v @ w ∈ lang r)"
proof -
  let ?n = "card (range (λw. Derivs w (lang r)))"
  have "∃u v w. z = u @ v @ w ∧ length (u @ v) ≤ ?n ∧ v ≠ [] ∧ (∀i. u @ repeat i v @ w ∈ lang r)"
    if "z ∈ lang r" and "length z ≥ ?n" for z
    by (intro pumping_lemma_aux[of z] that regular_Derivs_finite)
  thus ?thesis by (rule that)
qed
  
corollary pumping_lemma_not_regular_lang:
  fixes A :: "'a :: finite list set"
  assumes "⋀n. length (z n) ≥ n" and "⋀n. z n ∈ A"
  assumes "⋀n u v w. z n = u @ v @ w ⟹ length (u @ v) ≤ n ⟹ v ≠ [] ⟹ 
             u @ repeat (i n u v w) v @ w ∉ A"
  shows   "¬regular_lang A"
proof
  assume "regular_lang A"
  then obtain r where r: "lang r = A" by blast
  from pumping_lemma[of r] obtain n
    where "z n ∈ lang r ⟹ n ≤ length (z n) ⟹
      ∃u v w. z n = u @ v @ w ∧ length (u @ v) ≤ n ∧ v ≠ [] ∧ (∀i. u @ repeat i v @ w ∈ lang r)"
    by metis
  from this and assms[of n] obtain u v w 
    where "z n = u @ v @ w" and "length (u @ v) ≤ n" and "v ≠ []" and 
          "⋀i. u @ repeat i v @ w ∈ lang r" by (auto simp: r)
  with assms(3)[of n u v w] show False by (auto simp: r)
qed


subsection ‹Examples›

text ‹
  The language of all words containing the same number of $a$s and $b$s is not regular.
›
lemma "¬regular_lang {w. length (filter id w) = length (filter Not w)}" (is "¬regular_lang ?A")
proof (rule not_regular_langI')
  show "infinite (UNIV :: nat set)" by simp
next
  fix m n :: nat assume "m ≠ n"
  hence "replicate m True @ replicate m False ∈ ?A" and
        "replicate n True @ replicate m False ∉ ?A" by simp_all
  thus "∃w. ¬(replicate m True @ w ∈ ?A ⟷ replicate n True @ w ∈ ?A)" by blast
qed

text ‹
  The language $\{a^i b^i\ |\ i \in \mathbb{N}\}$ is not regular.
›
lemma eq_replicate_iff:
  "xs = replicate n x ⟷ set xs ⊆ {x} ∧ length xs = n"
  using replicate_length_same[of xs x] by (subst eq_commute) auto
    
lemma replicate_eq_appendE:
  assumes "xs @ ys = replicate n x"
  obtains i j where "n = i + j" "xs = replicate i x" "ys = replicate j x"
proof -
  have "n = length (replicate n x)" by simp
  also note assms [symmetric]
  finally have "n = length xs + length ys" by simp
  moreover have "xs = replicate (length xs) x" and "ys = replicate (length ys) x"
    using assms by (auto simp: eq_replicate_iff)
  ultimately show ?thesis using that[of "length xs" "length ys"] by auto
qed
  
lemma "¬regular_lang (range (λi. replicate i True @ replicate i False))" (is "¬regular_lang ?A")
proof (rule pumping_lemma_not_regular_lang)
  fix n :: nat
  show "length (replicate n True @ replicate n False) ≥ n" by simp
  show "replicate n True @ replicate n False ∈ ?A" by simp
next
  fix n :: nat and u v w :: "bool list"
  assume decomp: "replicate n True @ replicate n False = u @ v @ w"
     and length_le: "length (u @ v) ≤ n" and v_ne: "v ≠ []"
  define w1 w2 where "w1 = take (n - length (u@v)) w" and "w2 = drop (n - length (u@v)) w"
  have w_decomp: "w = w1 @ w2" by (simp add: w1_def w2_def)
  
  have "replicate n True = take n (replicate n True @ replicate n False)" by simp
  also note decomp
  also have "take n (u @ v @ w) = u @ v @ w1" using length_le by (simp add: w1_def)
  finally have "u @ v @ w1 = replicate n True" by simp
  then obtain i j k 
    where uvw1: "n = i + j + k" "u = replicate i True" "v = replicate j True" "w1 = replicate k True"
    by (elim replicate_eq_appendE) auto
  
  have "replicate n False = drop n (replicate n True @ replicate n False)" by simp
  also note decomp
  finally have [simp]: "w2 = replicate n False" using length_le by (simp add: w2_def)
      
  have "u @ repeat (Suc (Suc 0)) v @ w = replicate (n + j) True @ replicate n False"
    by (simp add: uvw1 w_decomp replicate_add [symmetric])
  also have "… ∉ ?A"
  proof safe
    fix m assume *: "replicate (n + j) True @ replicate n False = 
                       replicate m True @ replicate m False" (is "?lhs = ?rhs")
    have "n = length (filter Not ?lhs)" by simp
    also note *
    also have "length (filter Not ?rhs) = m" by simp
    finally have [simp]: "m = n" by simp
    from * have "v = []" by (simp add: uvw1)
    with ‹v ≠ []› show False by contradiction
  qed
  finally show "u @ repeat (Suc (Suc 0)) v @ w ∉ ?A" .
qed
    
end