Theory GroupF

section‹Group by Function›
theory GroupF
imports Main
begin

text‹Grouping elements of a list according to a function.›

fun groupF ::  "('a  'b)  'a list  'a list list"  where
  "groupF f [] = []" |
  "groupF f (x#xs) = (x#(filter (λy. f x = f y) xs))#(groupF f (filter (λy. f x  f y) xs))"

text‹trying a more efficient implementation of @{term groupF}
context
begin
  private fun select_p_tuple :: "('a  bool)  'a  ('a list × 'a list)  ('a list × 'a list)"
  where
    "select_p_tuple p x (ts,fs) = (if p x then (x#ts, fs) else (ts, x#fs))"
  
  private definition partition_tailrec :: "('a  bool)  'a list  ('a list × 'a list)"
  where
    "partition_tailrec p xs = foldr (select_p_tuple p) xs ([],[])"
  
  private lemma partition_tailrec: "partition_tailrec f as =  (filter f as,  filter (λx. ¬f x) as)"
  proof - 
    {fix ts_accu fs_accu
      have "foldr (select_p_tuple f) as (ts_accu, fs_accu) =
              (filter f as @ ts_accu,  filter (λx. ¬f x) as @ fs_accu)"
      by(induction as arbitrary: ts_accu fs_accu) simp_all
    } thus ?thesis unfolding partition_tailrec_def by simp
  qed
  
  private lemma
    "groupF f (x#xs) = (let (ts, fs) = partition_tailrec (λy. f x = f y) xs in (x#ts)#(groupF f fs))"
  by(simp add: partition_tailrec)
  
  (*is this more efficient?*)
  private function groupF_code ::  "('a  'b)  'a list  'a list list"  where
    "groupF_code f [] = []" |
    "groupF_code f (x#xs) = (let
                               (ts, fs) = partition_tailrec (λy. f x = f y) xs
                             in
                               (x#ts)#(groupF_code f fs))"
  by(pat_completeness) auto
  
  private termination groupF_code
    apply(relation "measure (λ(f,as). length (filter (λx. (λy. f x = f y) x) as))")
     apply(simp; fail)
    apply(simp add: partition_tailrec)
    using le_imp_less_Suc length_filter_le by blast
  
  lemma groupF_code[code]: "groupF f as = groupF_code f as"
    by(induction f as rule: groupF_code.induct) (simp_all add: partition_tailrec)
  
  export_code groupF checking SML
end

lemma groupF_concat_set: "set (concat (groupF f xs)) = set xs"
  proof(induction f xs rule: groupF.induct)
  case 2 thus ?case by (simp) blast
  qed(simp)

lemma groupF_Union_set: "(x  set (groupF f xs). set x) = set xs"
  proof(induction f xs rule: groupF.induct)
  case 2 thus ?case by (simp) blast
  qed(simp)

lemma groupF_set: "X  set (groupF f xs). x  set X. x  set xs"
  using groupF_concat_set by fastforce

lemma groupF_equality:
  defines "same f A  a1  set A. a2  set A. f a1 = f a2"
  shows "A  set (groupF f xs). same f A"
  proof(induction f xs rule: groupF.induct)
    case 1 thus ?case by simp
  next
    case (2 f x xs)
      have groupF_fst:
        "groupF f (x # xs) = (x # [yxs . f x = f y]) # groupF f [yxs . f x  f y]" by force
      have step: " Aset [x # [yxs . f x = f y]]. same f A" unfolding same_def by fastforce
      with 2 show ?case unfolding groupF_fst by fastforce
  qed

lemma groupF_nequality: "A  set (groupF f xs)  B  set (groupF f xs)  A  B 
     a  set A. b  set B. f a  f b"
  proof(induction f xs rule: groupF.induct)
  case 1 thus ?case by simp
  next
  case 2 thus ?case
    apply -
    apply(subst (asm) groupF.simps)+
    using groupF_set by fastforce (*1s*)
  qed

lemma groupF_cong: fixes xs::"'a list" and f1::"'a  'b" and f2::"'a  'c"
  assumes "x  set xs. y  set xs. (f1 x = f1 y  f2 x = f2 y)"
  shows "groupF f1 xs = groupF f2 xs"
  using assms proof(induction f1 xs rule: groupF.induct)
    case (2 f x xs) thus ?case using filter_cong[of xs xs "λy. f x = f y" "λy. f2 x = f2 y"]
                                     filter_cong[of xs xs "λy. f x  f y" "λy. f2 x  f2 y"] by auto
  qed (simp)

lemma groupF_empty: "groupF f xs  []  xs  []"
  by(induction f xs rule: groupF.induct) auto
lemma groupF_empty_elem: "x  set (groupF f xs)  x  []"
  by(induction f xs rule: groupF.induct) auto

lemma groupF_distinct: "distinct xs  distinct (concat (groupF f xs))"
  by (induction f xs rule: groupF.induct) (auto simp add: groupF_Union_set)

text‹It is possible to use
    @{term "map (map fst) (groupF snd (map (λx. (x, f x)) P))"}
  instead of
    @{term "groupF f P"}
  for the following reasons:
    @{const groupF} executes its compare function (first parameter) very often;
    it always tests for @{term "(f x = f y)"}.
    The function @{term f} may be really expensive.
    At least polyML does not share the result of @{term f} but (probably) always recomputes (part of) it.
    The optimization pre-computes @{term f} and tells @{const groupF} to use
    a really cheap function (@{const snd}) to compare.
    The following lemma tells that those are equal.›
  (* is this also faster for Haskell?*)

lemma groupF_tuple: "groupF f xs = map (map fst) (groupF snd (map (λx. (x, f x)) xs))"
  proof(induction f xs rule: groupF.induct)
  case (1 f) thus ?case by simp
  next
  case (2 f x xs)
    have g1: "[yxs . f x = f y] = map fst [ymap (λx. (x, f x)) xs . f x = snd y]"
      proof(induction xs arbitrary: f x)
      case Cons thus ?case by fastforce
      qed(simp)
    have g2: "(map (λx. (x, f x)) [yxs . f x  f y]) = [ymap (λx. (x, f x)) xs . f x  snd y]"
      proof(induction xs)
      case Cons thus ?case by fastforce
      qed(simp)
    from 2 g1 g2 show ?case by simp
  qed
end