chapter‹Activation Functions›
theory Activation_Functions
imports
"HOL-Analysis.Derivative"
TensorFlow_Import
NN_Common
"Interval_Analysis.Affine_Functions"
Jordan_Normal_Form.Matrix
begin
text‹
In this theory, we define the most common activation functions.
Moreover, we provide an ML API for working with HOL terms representing activation functions.
›
section‹Defining Activation Functions and Their Derivatives›
text‹
Many common activation functions make use of the exponential function $f(x) = e^x$ (written
@{term "exp x"} in HOL). For these activation functions, we also define approximations
based on the truncated Taylor series of the exponential function:
›
definition
‹exp⇩t⇩a⇩y⇩l⇩o⇩r n x = (∑i = 0..n. x^i / fact i)›
lemma exp⇩t⇩a⇩y⇩l⇩o⇩r⇩2: "exp⇩t⇩a⇩y⇩l⇩o⇩r 2 (x::real) = (1::real) + x + x^2/2"
by(code_simp, simp)
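text‹
As a small worked example (for illustration only, not part of the formal development), the
degree-three approximation at @{text "x = 1"} evaluates to
@{text "exp⇩t⇩a⇩y⇩l⇩o⇩r 3 1 = 1 + 1 + 1/2 + 1/6 = 8/3 ≈ 2.667"},
which is already reasonably close to @{text "exp 1 ≈ 2.718"}.
›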
subsection‹Activation Functions›
definition
‹identity = (λv. v)›
lemma identity_linear[simp]: ‹affine_fun identity›
unfolding identity_def by simp
definition binary_step :: ‹'a::{zero, one, ord} ⇒ 'a› where
‹binary_step = (λ v. if v ≤ 0 then 0 else 1)›
hide_const sign
definition
‹sign = sgn›
definition
‹softsign = (λv. v / (¦v¦ + 1))›
definition
‹logistic L k v⇩0 = (λv. L / (1 + exp(-k * (v - v⇩0))))›
definition
‹logistic⇩t⇩a⇩y⇩l⇩o⇩r n L k v⇩0 = (λv. L / (1 + (exp⇩t⇩a⇩y⇩l⇩o⇩r n (-k * (v - v⇩0)))))›
definition sigmoid :: "real ⇒ real" where
‹sigmoid = (λv. 1 / (1 + exp(-v)))›
definition
‹sigmoid⇩t⇩a⇩y⇩l⇩o⇩r n = (λv. 1 / (1 + (exp⇩t⇩a⇩y⇩l⇩o⇩r n (-v))))›
lemma ‹sigmoid = (logistic (1.0::real) 1.0 0)›
unfolding sigmoid_def logistic_def by auto
lemma ‹sigmoid⇩t⇩a⇩y⇩l⇩o⇩r n = (logistic⇩t⇩a⇩y⇩l⇩o⇩r n (1.0::real) 1.0 0)›
unfolding sigmoid⇩t⇩a⇩y⇩l⇩o⇩r_def logistic⇩t⇩a⇩y⇩l⇩o⇩r_def by auto
definition
‹swish = (λv. v * (sigmoid v))›
definition
‹swish⇩t⇩a⇩y⇩l⇩o⇩r n = (λv. v * (sigmoid⇩t⇩a⇩y⇩l⇩o⇩r n v))›
definition
‹relu = (λv. max 0 v)›
definition
‹generalized_relu α m t = (λv. case m of Some m' ⇒ min (if v ≤ t then α * v else v) m'
| None ⇒ if v ≤ t then α * v else v)›
lemma ‹relu = (generalized_relu (0.0::real) None (0.0))›
unfolding relu_def generalized_relu_def by auto
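text‹
The parameters of @{const ‹generalized_relu›} mirror those of TensorFlow's relu (slope below
the threshold, optional maximum value, threshold). For illustration only (these identities are
not formally proven here), a leaky ReLU with slope @{text "0.01"} corresponds to
@{text "generalized_relu 0.01 None 0"}, and ReLU6 corresponds to @{text "generalized_relu 0 (Some 6) 0"}.
›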
definition
‹softplus = (λv. ln (1 + exp v))›
definition
‹elu α = (λv. if v ≤ 0 then α * ((exp v)-1) else v)›
definition
‹elu⇩t⇩a⇩y⇩l⇩o⇩r n α = (λv. if v ≤ 0 then α * ((exp⇩t⇩a⇩y⇩l⇩o⇩r n v)-1) else v)›
definition
‹selu = (λv. let α = 1.67326324;
scale = 1.05070098
in if v ≤ 0 then scale * α * ((exp v)-1) else scale * v)›
definition
‹selu⇩t⇩a⇩y⇩l⇩o⇩r n = (λv. let α = 1.67326324;
scale = 1.05070098
in if v ≤ 0 then scale * α * ((exp⇩t⇩a⇩y⇩l⇩o⇩r n v)-1) else scale * v)›
definition
‹prelu α = (λv. if v < 0 then α * v else v)›
definition
‹silu = (λv. v / (1 + (exp (-v))))›
definition
‹silu⇩t⇩a⇩y⇩l⇩o⇩r n = (λv. v / (1 + (exp⇩t⇩a⇩y⇩l⇩o⇩r n (-v))))›
definition
‹gaussian = (λv. exp (- v⇧2))›
definition
‹gaussian⇩t⇩a⇩y⇩l⇩o⇩r n = (λv. exp⇩t⇩a⇩y⇩l⇩o⇩r n (- v⇧2))›
definition
‹hard_sigmoid = (λv. if v < -2.5 then 0 else if v > 2.5 then 1 else 0.2 * v + 0.5)›
definition
‹gelu_approx = (λv. 0.5 * v * (1 + tanh(sqrt(2 / pi) * (v + 0.044715 * v * v * v))))›
text‹
Note that the error function @{term "erf"} is available in the AFP entry~\<^cite>‹"eberl:erf:2018"›; it can
be used to define a non-approximated @{term "gelu"} activation function, as sketched below.
›
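text‹
Under that import, a non-approximated variant could be defined along the following lines
(a sketch only, assuming the constant @{text "erf"} from that AFP entry is available; it is
not part of this theory):
@{text "gelu = (λv. 0.5 * v * (1 + erf (v / sqrt 2)))"}
›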
definition softmax :: "('a::{banach,real_normed_algebra_1,inverse}) list ⇒ 'a list" where
‹softmax vs = map (λ v. exp v / (∑v'←vs. exp v')) vs›
definition msoftmax :: "('a::{banach,real_normed_algebra_1,inverse}) vec ⇒ 'a vec" where
‹msoftmax vs = map_vec (λ v. exp v / (∑v'← (list_of_vec vs). exp v')) vs›
definition softmax⇩t⇩a⇩y⇩l⇩o⇩r :: "nat ⇒ ('a::{banach,real_normed_algebra_1,inverse}) list ⇒ 'a list" where
‹softmax⇩t⇩a⇩y⇩l⇩o⇩r n vs = map (λ v. (exp⇩t⇩a⇩y⇩l⇩o⇩r n v) / (∑v'←vs. (exp⇩t⇩a⇩y⇩l⇩o⇩r n v'))) vs›
definition msoftmax⇩t⇩a⇩y⇩l⇩o⇩r :: "nat ⇒ ('a::{banach,real_normed_algebra_1,inverse}) vec ⇒ 'a vec" where
‹msoftmax⇩t⇩a⇩y⇩l⇩o⇩r n vs = map_vec (λ v. (exp⇩t⇩a⇩y⇩l⇩o⇩r n v) / (∑v'← (list_of_vec vs). (exp⇩t⇩a⇩y⇩l⇩o⇩r n v'))) vs›
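text‹
As an informal sanity check (not proven here): since @{text "exp 0 = 1"}, applying
@{const ‹softmax›} to a list of identical entries yields a uniform distribution, e.g.
@{text "softmax [0, 0] = [1/2, 1/2]"} and @{text "softmax [c, c, c, c] = [1/4, 1/4, 1/4, 1/4]"}.
›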
lemma softmax⇩t⇩a⇩y⇩l⇩o⇩r⇩2:
"softmax⇩t⇩a⇩y⇩l⇩o⇩r 2 vs = map (λ (v::real). (1 + v + v⇧2/2) / (foldl (+) 0 (map (λ v'. 1 + v' + v'⇧2/2) vs))) vs"
unfolding softmax⇩t⇩a⇩y⇩l⇩o⇩r_def exp⇩t⇩a⇩y⇩l⇩o⇩r⇩2
by (simp add: Groups.add_ac(2) fold_plus_sum_list_rev foldl_conv_fold)
lemma softmax⇩t⇩a⇩y⇩l⇩o⇩r⇩2': "softmax⇩t⇩a⇩y⇩l⇩o⇩r 2 vs = map (λ (v::real). (1 + v + v⇧2/2) / (foldl (λa x. a + (1 + x + x⇧2 / 2)) 0 vs)) vs"
apply(simp add: softmax⇩t⇩a⇩y⇩l⇩o⇩r⇩2)
by(code_simp, simp)
definition
‹argmax vs = map (λ v. if v = Max (set vs) then 1 else 0) vs›
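text‹
Note that @{const ‹argmax›} yields a one-hot vector only if the maximum is unique; for example
(informally), @{text "argmax [1, 3, 2] = [0, 1, 0]"}, whereas @{text "argmax [2, 2] = [1, 1]"}.
›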
text‹
\autoref{tab:tensorflow-activation} provides a mapping from our names of the activation functions
to the names used by TensorFlow (see 🌐‹https://www.tensorflow.org/api_docs/python/tf/keras/activations/›).
\begin{landscape}
\begin{table}
\caption{Mapping of the activation functions supported by TensorFlow.}
\label{tab:tensorflow-activation}
\begin{center}
\small\renewcommand{\arraystretch}{1.2}
\begin{tabular}{@ {}llp{14cm}@ {}}
\toprule
& TensorFlow 2.8.0 & Definition\\
\cmidrule(r){2-3}
@{const ‹identity›} & linear & \vspace{-.7cm}@{thm [display, margin=120] "identity_def"}\vspace{-0.9cm}\\
@{const ‹softsign›} & softsign & \vspace{-.7cm}@{thm [display, margin=120] "softsign_def"}\vspace{-0.9cm}\\
@{const ‹sigmoid›} & sigmoid & \vspace{-.7cm}@{thm [display, margin=120] "sigmoid_def"}\vspace{-0.9cm}\\
@{const ‹sigmoid⇩t⇩a⇩y⇩l⇩o⇩r›} & \multicolumn{1}{c}{--} & \vspace{-.7cm}@{thm [display, margin=120] "sigmoid⇩t⇩a⇩y⇩l⇩o⇩r_def"}\vspace{-0.9cm}\\
@{const ‹swish›} & swish & \vspace{-.7cm}@{thm [display, margin=120] "swish_def"}\vspace{-0.9cm}\\
@{const ‹swish⇩t⇩a⇩y⇩l⇩o⇩r›} & \multicolumn{1}{c}{--} & \vspace{-.7cm}@{thm [display, margin=120] "swish⇩t⇩a⇩y⇩l⇩o⇩r_def"}\vspace{-0.9cm}\\
@{const ‹tanh›} & tanh & \vspace{-.7cm}@{thm [display, margin=120] "tanh_def"}\vspace{-0.9cm}\\
@{const ‹generalized_relu›} & relu & \vspace{-.7cm}@{thm [display, margin=120] "generalized_relu_def"}\vspace{-0.9cm}\\
@{const ‹relu›} & relu (with default parameters) & \vspace{-.7cm}@{thm [display, margin=120] "relu_def"}\vspace{-0.9cm}\\
@{const ‹gelu_approx›} & gelu (approximate=True) & \vspace{-.7cm}@{thm [display, margin=120] "gelu_approx_def"}\vspace{-0.9cm}\\
\multicolumn{1}{c}{--} & gelu (approximate=False) & \multicolumn{1}{c}{--}\\
@{const ‹softplus›} & softplus & \vspace{-.7cm}@{thm [display, margin=120] "softplus_def"}\vspace{-0.9cm}\\
@{const ‹elu›} & elu & \vspace{-.7cm}@{thm [display, margin=120] "elu_def"}\vspace{-0.9cm}\\
@{const ‹elu⇩t⇩a⇩y⇩l⇩o⇩r›} & \multicolumn{1}{c}{--} & \vspace{-.7cm}@{thm [display, margin=120] "elu⇩t⇩a⇩y⇩l⇩o⇩r_def"}\vspace{-0.9cm}\\
@{const ‹selu›} & selu &\vspace{-.7cm}@{thm [display, margin=120] "selu_def"}\vspace{-0.9cm}\\
@{const ‹selu⇩t⇩a⇩y⇩l⇩o⇩r›} & \multicolumn{1}{c}{--} & \vspace{-.7cm}@{thm [display, margin=120] "selu⇩t⇩a⇩y⇩l⇩o⇩r_def"}\vspace{-0.9cm}\\
@{const ‹exp›} & exponential & \vspace{-.7cm}@{thm [display, margin=120] "exp_def"}\vspace{-0.9cm}\\
@{const ‹exp⇩t⇩a⇩y⇩l⇩o⇩r›} & \multicolumn{1}{c}{--} & \vspace{-.7cm}@{thm [display, margin=120] "exp⇩t⇩a⇩y⇩l⇩o⇩r_def"}\vspace{-0.9cm}\\
@{const ‹hard_sigmoid›} & hard\_sigmoid & \vspace{-.7cm}@{thm [display, margin=120] "hard_sigmoid_def"}\vspace{-0.9cm}\\
@{const ‹softmax›} & softmax & \vspace{-.7cm}@{thm [display, margin=120] "softmax_def"}\vspace{-0.9cm}\\
@{const ‹softmax⇩t⇩a⇩y⇩l⇩o⇩r›} & \multicolumn{1}{c}{--} & \vspace{-.7cm}@{thm [display, margin=120] "softmax⇩t⇩a⇩y⇩l⇩o⇩r_def"}\vspace{-0.9cm}\\
\bottomrule
\end{tabular}\end{center}\end{table}
\end{landscape}
›
subsection‹Derivatives of Activation Functions›
lemma has_real_derivative_transform:
‹x ∈ s ⟹ (⋀x. x ∈ s ⟹ g x = f x) ⟹ (f has_real_derivative f') (at x within s)
⟹ (g has_real_derivative f') (at x within s)›
by (simp add: has_derivative_transform has_field_derivative_def)
lemma one_plus_exp_eq: "(1 + exp v) = (exp v) * (1 + exp (- v)) "
by (simp add: distrib_left exp_minus_inverse)
definition ‹identity' = (λ v. 1.0)›
lemma identity'[simp]: ‹(identity has_real_derivative (identity' v)) (at v)›
by(simp add:identity_def identity'_def)
definition ‹logistic' L k v⇩0 = (λ v. (exp( (-k)*(v-v⇩0)) * k * L) / (1 + exp((-k)*(v-v⇩0)))⇧2)›
lemma logistic'[simp]: ‹((logistic L k v⇩0) has_real_derivative ((logistic' L k v⇩0) v)) (at v)›
apply (simp add:logistic_def logistic'_def, intro derivative_eq_intros, simp_all)
subgoal by (metis add_eq_0_iff exp_ge_zero le_minus_one_simps(3))
subgoal by (simp add: power2_eq_square)
done
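text‹
Since @{text "sigmoid = logistic 1 1 0"}, the familiar derivative of the sigmoid is an instance
of @{text "logistic'"} (a worked equation for illustration, not formalised as a separate lemma):
@{text "logistic' 1 1 0 v = exp (-v) / (1 + exp (-v))⇧2 = sigmoid v * (1 - sigmoid v)"}.
›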
definition ‹tanh' = (λ v. 1 - ((tanh v)⇧2))›
lemma tanh'[simp]: ‹(tanh has_real_derivative (tanh' v)) (at v)›
by (auto intro: derivative_eq_intros simp add:tanh'_def)
definition ‹softplus' = (λ v. 1 / (1 + exp(-v)))›
lemma softplus'[simp]: ‹(softplus has_real_derivative (softplus' v)) (at v)›
apply (simp add: softplus_def softplus'_def, intro derivative_eq_intros, simp_all add: add_pos_pos)
by (metis one_plus_exp_eq add.left_neutral exp_not_eq_zero mult.right_neutral
nonzero_divide_mult_cancel_left)
definition ‹prelu1' = (λ v. 1)›
lemma prelu1'[simp]: ‹((prelu 1) has_real_derivative (prelu1' v)) (at v)›
proof -
have *: ‹(λ v. if v < (0::real) then (1::real) * v else v) = (λ v. v)› by auto
show ?thesis
by (simp add: prelu_def prelu1'_def if_split *)
qed
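text‹
The restriction to @{text "α = 1"} is essential here: for @{text "α ≠ 1"} the left and right
derivatives of @{text "prelu α"} at @{text "0"} differ (@{text "α"} versus @{text "1"}), so
@{text "prelu α"} has no derivative at that point.
›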
definition ‹silu' = (λ v. (1 + exp (-v) + v * (exp (-v)) ) / ((1 + exp(-v))⇧2))›
lemma silu'[simp]: ‹(silu has_real_derivative (silu' v)) (at v)›
apply(simp add:silu_def silu'_def)
apply (intro derivative_eq_intros, simp_all)
subgoal by (metis add_eq_0_iff exp_ge_zero le_minus_one_simps(3))
subgoal by (simp add: power2_eq_square)
done
definition ‹gaussian' = (λ v. -2 * v * exp (- v⇧2))›
lemma gaussian'[simp]: ‹(gaussian has_real_derivative (gaussian' v)) (at v)›
by (simp add:gaussian_def gaussian'_def, intro derivative_eq_intros, simp_all, simp)
subsection‹Single Class Folding Activation Functions›