# Theory Non_Regular_Languages

(*
File:     Non_Regular_Languages.thy
Author:   Manuel Eberl <manuel@pruvisto.org>

This file provides some tools for showing the non-regularity of a language, either
via an infinite set of equivalence classes or via the Pumping Lemma.
*)
section ‹Tools for showing non-regularity of a language›
theory Non_Regular_Languages
imports Myhill
begin

subsection ‹Auxiliary material›

lemma bij_betw_image_quotient:
"bij_betw (λy. f - {y}) (f  A) (A // {(a,b). f a = f b})"
by (force simp: bij_betw_def inj_on_def image_image quotient_def)

lemma regular_Derivs_finite:
fixes r :: "'a :: finite rexp"
shows "finite (range (λw. Derivs w (lang r)))"
proof -
have "?thesis ⟷ finite (UNIV // ≈lang r)"
unfolding str_eq_conv_Derivs by (rule bij_betw_finite bij_betw_image_quotient)+
also have "…" by (subst Myhill_Nerode [symmetric]) auto
finally show ?thesis .
qed

lemma Nil_in_Derivs_iff: "[] ∈ Derivs w A ⟷ w ∈ A"
by (auto simp: Derivs_def)

(* TODO: Move to distribution? *)
text ‹
The following operation repeats a list $n$ times (usually written as $w ^ n$).
›
primrec repeat :: "nat ⇒ 'a list ⇒ 'a list" where
"repeat 0 xs = []"
| "repeat (Suc n) xs = xs @ repeat n xs"

lemma repeat_Cons_left: "repeat (Suc n) xs = xs @ repeat n xs" by simp

lemma repeat_Cons_right: "repeat (Suc n) xs = repeat n xs @ xs"
by (induction n) simp_all

lemma repeat_Cons_append_commute [simp]: "repeat n xs @ xs = xs @ repeat n xs"
by (subst repeat_Cons_right [symmetric]) simp

lemma repeat_Cons_add [simp]: "repeat (m + n) xs = repeat m xs @ repeat n xs"
by (induction m) simp_all

lemma repeat_Nil [simp]: "repeat n [] = []"
by (induction n) simp_all

lemma repeat_conv_replicate: "repeat n xs = concat (replicate n xs)"
by (induction n) simp_all

(* TODO: Move to distribution? *)
lemma nth_prefixes [simp]: "n ≤ length xs ⟹ prefixes xs ! n = take n xs"
by (induction xs arbitrary: n) (auto simp: nth_Cons split: nat.splits)

lemma nth_suffixes [simp]: "n ≤ length xs ⟹ suffixes xs ! n = drop (length xs - n) xs"
by (subst suffixes_conv_prefixes) (simp_all add: rev_take)

lemma length_take_prefixes:
assumes "xs ∈ set (take n (prefixes ys))"
shows   "length xs < n"
proof (cases "n ≤ Suc (length ys)")
case True
with assms obtain i where "i < n" "xs = take i ys"
by (subst (asm) nth_image [symmetric]) auto
thus ?thesis by simp
next
case False
with assms have "prefix xs ys" by simp
hence "length xs ≤ length ys" by (rule prefix_length_le)
also from False have "… < n" by simp
finally show ?thesis .
qed

subsection ‹Non-regularity by giving an infinite set of equivalence classes›

text ‹
Non-regularity can be shown by giving an infinite set of non-equivalent words (w.r.t. the
Myhill--Nerode relation).
›
lemma not_regular_langI:
assumes "infinite B" "⋀x y. x ∈ B ⟹ y ∈ B ⟹ x ≠ y ⟹ ∃w. ¬(x @ w ∈ A ⟷ y @ w ∈ A)"
shows   "¬regular_lang (A :: 'a :: finite list set)"
proof -
have "(λw. Derivs w A)  B ⊆ range (λw. Derivs w A)" by blast
moreover from assms(2) have "inj_on (λw. Derivs w A) B"
by (auto simp: inj_on_def Derivs_def)
with assms(1) have "infinite ((λw. Derivs w A)  B)"
by (blast dest: finite_imageD)
ultimately have "infinite (range (λw. Derivs w A))" by (rule infinite_super)
with regular_Derivs_finite show ?thesis by blast
qed

lemma not_regular_langI':
assumes "infinite B" "⋀x y. x ∈ B ⟹ y ∈ B ⟹ x ≠ y ⟹ ∃w. ¬(f x @ w ∈ A ⟷ f y @ w ∈ A)"
shows   "¬regular_lang (A :: 'a :: finite list set)"
proof (rule not_regular_langI)
from assms(2) have "inj_on f B" by (force simp: inj_on_def)
with ‹infinite B› show "infinite (f  B)" by (simp add: finite_image_iff)
qed (insert assms, auto)

subsection ‹The Pumping Lemma›

text ‹
The Pumping lemma can be shown very easily from the Myhill--Nerode theorem: if we have
a word whose length is more than the (finite) number of equivalence classes, then it must
have two different prefixes in the same class and the difference between these two
prefixes can then be pumped''.
›
lemma pumping_lemma_aux:
fixes A :: "'a list set"
defines "δ ≡ λw. Derivs w A"
defines "n ≡ card (range δ)"
assumes "z ∈ A" "finite (range δ)" "length z ≥ n"
shows "∃u v w. z = u @ v @ w ∧ length (u @ v) ≤ n ∧ v ≠ [] ∧ (∀i. u @ repeat i v @ w ∈ A)"
proof -
define P where "P = set (take (Suc n) (prefixes z))"
from ‹length z ≥ n› have [simp]: "card P = Suc n"
unfolding P_def by (subst distinct_card) (auto intro!: distinct_take)
have length_le: "length y ≤ n" if "y ∈ P" for y
using length_take_prefixes[OF that [unfolded P_def]] by simp

have "card (δ  P) ≤ card (range δ)" by (intro card_mono assms) auto
also from assms have "… < card P" by simp
finally have "¬inj_on δ P" by (rule pigeonhole)
then obtain a b where ab: "a ∈ P" "b ∈ P" "a ≠ b" "Derivs a A = Derivs b A"
by (auto simp: inj_on_def δ_def)
from ab have prefix_ab: "prefix a z" "prefix b z" by (auto simp: P_def dest: in_set_takeD)
from ab have length_ab: "length a ≤ n" "length b ≤ n"
by (simp_all add: length_le)

have *: ?thesis
if uz': "prefix u z'" "prefix z' z" "length z' ≤ n"
"u ≠ z'" "Derivs z' A = Derivs u A" for u z'
proof -
from ‹prefix u z'› and ‹u ≠ z'›
obtain v where v [simp]: "z' = u @ v" "v ≠ []"
by (auto simp: prefix_def)
from ‹prefix z' z› obtain w where [simp]: "z = u @ v @ w"
by (auto simp: prefix_def)

hence [simp]: "Derivs (repeat i v) (Derivs u A) = Derivs u A" for i
by (induction i) (use uz' in simp_all)
have "Derivs z A = Derivs (u @ repeat i v @ w) A" for i
using uz' by simp
with ‹z ∈ A› and uz' have "∀i. u @ repeat i v @ w ∈ A"
by (simp add: Nil_in_Derivs_iff [of _ A, symmetric])
moreover have "z = u @ v @ w" by simp
moreover from ‹length z' ≤ n› have "length (u @ v) ≤ n" by simp
ultimately show ?thesis using ‹v ≠ []› by blast
qed

from prefix_ab have "prefix a b ∨ prefix b a" by (rule prefix_same_cases)
with *[of a b] and *[of b a] and ab and prefix_ab and length_ab show ?thesis by blast
qed

theorem pumping_lemma:
fixes r :: "'a :: finite rexp"
obtains n where
"⋀z. z ∈ lang r ⟹ length z ≥ n ⟹
∃u v w. z = u @ v @ w ∧ length (u @ v) ≤ n ∧ v ≠ [] ∧ (∀i. u @ repeat i v @ w ∈ lang r)"
proof -
let ?n = "card (range (λw. Derivs w (lang r)))"
have "∃u v w. z = u @ v @ w ∧ length (u @ v) ≤ ?n ∧ v ≠ [] ∧ (∀i. u @ repeat i v @ w ∈ lang r)"
if "z ∈ lang r" and "length z ≥ ?n" for z
by (intro pumping_lemma_aux[of z] that regular_Derivs_finite)
thus ?thesis by (rule that)
qed

corollary pumping_lemma_not_regular_lang:
fixes A :: "'a :: finite list set"
assumes "⋀n. length (z n) ≥ n" and "⋀n. z n ∈ A"
assumes "⋀n u v w. z n = u @ v @ w ⟹ length (u @ v) ≤ n ⟹ v ≠ [] ⟹
u @ repeat (i n u v w) v @ w ∉ A"
shows   "¬regular_lang A"
proof
assume "regular_lang A"
then obtain r where r: "lang r = A" by blast
from pumping_lemma[of r] obtain n
where "z n ∈ lang r ⟹ n ≤ length (z n) ⟹
∃u v w. z n = u @ v @ w ∧ length (u @ v) ≤ n ∧ v ≠ [] ∧ (∀i. u @ repeat i v @ w ∈ lang r)"
by metis
from this and assms[of n] obtain u v w
where "z n = u @ v @ w" and "length (u @ v) ≤ n" and "v ≠ []" and
"⋀i. u @ repeat i v @ w ∈ lang r" by (auto simp: r)
with assms(3)[of n u v w] show False by (auto simp: r)
qed

subsection ‹Examples›

text ‹
The language of all words containing the same number of $a$s and $b$s is not regular.
›
lemma "¬regular_lang {w. length (filter id w) = length (filter Not w)}" (is "¬regular_lang ?A")
proof (rule not_regular_langI')
show "infinite (UNIV :: nat set)" by simp
next
fix m n :: nat assume "m ≠ n"
hence "replicate m True @ replicate m False ∈ ?A" and
"replicate n True @ replicate m False ∉ ?A" by simp_all
thus "∃w. ¬(replicate m True @ w ∈ ?A ⟷ replicate n True @ w ∈ ?A)" by blast
qed

text ‹
The language $\{a^i b^i\ |\ i \in \mathbb{N}\}$ is not regular.
›
lemma eq_replicate_iff:
"xs = replicate n x ⟷ set xs ⊆ {x} ∧ length xs = n"
using replicate_length_same[of xs x] by (subst eq_commute) auto

lemma replicate_eq_appendE:
assumes "xs @ ys = replicate n x"
obtains i j where "n = i + j" "xs = replicate i x" "ys = replicate j x"
proof -
have "n = length (replicate n x)" by simp
also note assms [symmetric]
finally have "n = length xs + length ys" by simp
moreover have "xs = replicate (length xs) x" and "ys = replicate (length ys) x"
using assms by (auto simp: eq_replicate_iff)
ultimately show ?thesis using that[of "length xs" "length ys"] by auto
qed

lemma "¬regular_lang (range (λi. replicate i True @ replicate i False))" (is "¬regular_lang ?A")
proof (rule pumping_lemma_not_regular_lang)
fix n :: nat
show "length (replicate n True @ replicate n False) ≥ n" by simp
show "replicate n True @ replicate n False ∈ ?A" by simp
next
fix n :: nat and u v w :: "bool list"
assume decomp: "replicate n True @ replicate n False = u @ v @ w"
and length_le: "length (u @ v) ≤ n" and v_ne: "v ≠ []"
define w1 w2 where "w1 = take (n - length (u@v)) w" and "w2 = drop (n - length (u@v)) w"
have w_decomp: "w = w1 @ w2" by (simp add: w1_def w2_def)

have "replicate n True = take n (replicate n True @ replicate n False)" by simp
also note decomp
also have "take n (u @ v @ w) = u @ v @ w1" using length_le by (simp add: w1_def)
finally have "u @ v @ w1 = replicate n True" by simp
then obtain i j k
where uvw1: "n = i + j + k" "u = replicate i True" "v = replicate j True" "w1 = replicate k True"
by (elim replicate_eq_appendE) auto

have "replicate n False = drop n (replicate n True @ replicate n False)" by simp
also note decomp
finally have [simp]: "w2 = replicate n False" using length_le by (simp add: w2_def)

have "u @ repeat (Suc (Suc 0)) v @ w = replicate (n + j) True @ replicate n False"
by (simp add: uvw1 w_decomp replicate_add [symmetric])
also have "… ∉ ?A"
proof safe
fix m assume *: "replicate (n + j) True @ replicate n False =
replicate m True @ replicate m False" (is "?lhs = ?rhs")
have "n = length (filter Not ?lhs)" by simp
also note *
also have "length (filter Not ?rhs) = m" by simp
finally have [simp]: "m = n" by simp
from * have "v = []" by (simp add: uvw1)
with ‹v ≠ []› show False by contradiction
qed
finally show "u @ repeat (Suc (Suc 0)) v @ w ∉ ?A" .
qed

end