Theory C_Appendices

(******************************************************************************
 * Isabelle/C
 *
 * Copyright (c) 2018-2019 Université Paris-Saclay, Univ. Paris-Sud, France
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *
 *     * Redistributions in binary form must reproduce the above
 *       copyright notice, this list of conditions and the following
 *       disclaimer in the documentation and/or other materials provided
 *       with the distribution.
 *
 *     * Neither the name of the copyright holders nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************************************************************)

chapter ‹A Résumé on Isabelle/C: Commands, Control Attributes and Programming Infrastructure›

theory C_Appendices
  imports "../examples/C2"
          Isar_Ref.Base
begin

(*<*)
ML ― ‹🗏‹~~/src/Doc/antiquote_setup.ML››
(*  Author:     Frédéric Tuong, Université Paris-Saclay
    Analogous to:
(*  Title:      Doc/antiquote_setup.ML
    Author:     Makarius

Auxiliary antiquotations for the Isabelle manuals.
*)*)
structure C_Antiquote_Setup =
struct

(* misc utils *)

fun translate f = Symbol.explode #> map f #> implode;

val clean_string = translate
  (fn "_" => "\\_"
    | "#" => "\\#"
    | "$" => "\\$"
    | "%" => "\\%"
    | "<" => "$<$"
    | ">" => "$>$"
    | "{" => "\\{"
    | "|" => "$\\mid$"
    | "}" => "\\}"
    | "‐" => "-"
    | c => c);

fun clean_name "…" = "dots"
  | clean_name ".." = "ddot"
  | clean_name "." = "dot"
  | clean_name "_" = "underscore"
  | clean_name "{" = "braceleft"
  | clean_name "}" = "braceright"
  | clean_name s = s |> translate (fn "_" => "-"
                                    | "‐" => "-"
                                    | "#" => "symbol-hash"
                                    | "≈" => "symbol-lower-approx"
                                    | "⇓" => "symbol-upper-down"
                                    | c => c);


(* Isabelle/Isar entities (with index) *)

local

val arg = enclose "{" "}" o clean_string;

fun entity check markup binding index =
  Document_Output.antiquotation_raw
    (binding |> Binding.map_name (fn name => name ^
      (case index of NONE => "" | SOME true => "_def" | SOME false => "_ref")))
    (Scan.lift (Scan.optional (Args.parens Args.name) "" -- Parse.position Args.name))
    (fn ctxt => fn (logic, (name, pos)) =>
      let
        val kind = translate (fn "_" => " " | c => c) (Binding.name_of binding);
        val hyper_name =
          "{" ^ Long_Name.append kind (Long_Name.append logic (clean_name name)) ^ "}";
        val hyper =
          enclose ("\\hyperlink" ^ hyper_name ^ "{") "}" #>
          index = SOME true ? enclose ("\\hypertarget" ^ hyper_name ^ "{") "}";
        val idx =
          (case index of
            NONE => ""
          | SOME is_def =>
              "\\index" ^ (if is_def then "def" else "ref") ^ arg logic ^ arg kind ^ arg name);
        val _ =
          if Context_Position.is_reported ctxt pos then ignore (check ctxt (name, pos)) else ();
        val latex =
          idx ^
          (Output.output name
            |> (if markup = "" then I else enclose ("\\" ^ markup ^ "{") "}")
            |> hyper o enclose "\\mbox{\\isa{" "}}");
      in Latex.string latex end);

fun entity_antiqs check markup kind =
  entity check markup kind NONE #>
  entity check markup kind (SOME true) #>
  entity check markup kind (SOME false);

in

val _ =
  Theory.setup
   (entity_antiqs C_Annotation.check_command "isacommand" bindingannotation);

end;

end;
(*>*)

section ‹Syntax Commands for Isabelle/C›

subsection ‹Toplevel Commands and Control Attributes›

(* @{attribute_def Cenv0} causes LaTeX errors *)
text ‹
  \begin{matharray}{rcl}
    @{command_def "C_file"} & : & local_theory → local_theory› \\
    @{command_def "C"} & : & local_theory → local_theory› \\
    @{command_def "C_export_boot"} & : & local_theory → local_theory› \\
    @{command_def "C_prf"} & : & proof → proof› \\
    @{command_def "C_val"} & : & any →› \\
    @{command_def "C_export_file"} & : & any →› \\
  \end{matharray}
  \begin{tabular}{rcll}
    @{attribute_def C_lexer_trace} & : & attribute› & default false› \\
    @{attribute_def C_parser_trace} & : & attribute› & default false› \\
    @{attribute_def C_ML_verbose} & : & attribute› & default true› \\
    Cenv0 & : & attribute› & default empty› \\
    Crule0 & : & attribute› & default translation_unit› \\
  \end{tabular}

  rail@@{command C_file} @{syntax name} ';'?
    ;
    (@@{command C} | @@{command C_export_boot} | @@{command C_prf} |
      @@{command C_val}) @{syntax text}
    ;
    @@{command C_export_file}
    ;

 C_file name› reads the given C file and lets any attached
  semantic back-ends proceed with further subsequent evaluation. Top-level C bindings are stored
  within the (global or local) theory context; the initial environment is by default either an
  empty one, or the one returned by a previous C_file (depending on attribute
  Cenv0). The entry-point of the grammar taken as the initial starting point of the parser
  is read from attribute Crule0 (see
  🌐‹https://www.haskell.org/happy/doc/html/sec-directives.html#sec-parser-name›).
  Multiple C_file commands may be used to build larger C projects, provided
  they are all written in a single theory file (existing parent theories are ignored and do not
  affect the current working theory).

 C is similar to
  C_file, but directly evaluates the
  given text›. The resulting top-level bindings are stored
  within the (global or local) theory context.

 C_export_boot is similar to
  ML_export, except that the input code
  is processed by
  C instead of ML.

 C_prf is similar to
  ML_prf, except that the input code
  is processed by
  C instead of ML.

 C_val is similar to
  ML_val, except that the input code
  is processed by
  C instead of ML.

   C_export_file is similar to
  generate_file fic = ‹code›
    export_generated_files fic›, except that
     code› refers to the dump of all existing previous C code in the current
    theory (parent theories are ignored),
     and any ML antiquotations in code› are not analyzed by
    generate_file (in contrast with its default behavior). ›

(* again,  @{attribute Cenv0}  and  @{attribute Crule0} cause LaTeX errors... *)
text  @{attribute C_lexer_trace} indicates whether the list of C
  tokens associated with the source text should be output (that list is
  computed during the lexing phase).

   @{attribute C_parser_trace} indicates whether the stack
  forest of Shift-Reduce nodes should be output (it is the final stack
  that is printed, i.e., the one obtained as soon as the parsing
  terminates).

   @{attribute C_ML_verbose} indicates whether nested
  ML commands act similarly to
  their default verbose configuration at top-level.

   Cenv0 makes a C
  command (e.g., C_file,
  C) start with the environment of
  the previous C command, if one exists.

   Crule0 sets the root syntactic category in which the parser of
  C commands (e.g., C_file, C) starts, as illustrated below. Possible values are:
        "expression"›,
        "statement"›,
        "external_declaration"› and
        "translation_unit"› (the default).
›
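
text ‹ As a minimal sketch of Crule0 in action (the variables a› and
b› are arbitrary and will simply be reported as free), the following setting makes the
parser of the next C command start in the statement› category:

@{theory_text [display]
declare [[Crule0 = "statement"]]
 C‹a = a + b;›}
›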

subsection ‹Predefined Annotation Commands inside the C context (C Annotation commands)›

text rail‹
    (@@{annotation "#ML_file"} | @@{annotation ML_file} | @@{annotation "ML_file⇓"} |
      @@{annotation "#C_file"} | @@{annotation C_file} | @@{annotation "C_file⇓"}) @{syntax name} ';'?
    ;
    (@@{annotation "#ML"} | @@{annotation ML} | @@{annotation "ML⇓"} |
      @@{annotation "#setup"} | @@{annotation setup} | @@{annotation "setup⇓"} |
      @@{annotation "≈setup"} | @@{annotation "≈setup⇓"} |
      @@{annotation "#C"} | @@{annotation C} | @@{annotation "C⇓"} |
      @@{annotation "#C_export_boot"} | @@{annotation C_export_boot} | @@{annotation "C_export_boot⇓"}) @{syntax text}
    ;
    (@@{annotation "#C_export_file"} | @@{annotation C_export_file} | @@{annotation "C_export_file⇓"} |
     @@{annotation highlight} | @@{annotation "highlight⇓"})
    ;

   C_theory_textML_file, C_theory_textC_file,
  C_theory_textML, C_theory_textsetup,
  C_theory_textC, C_theory_textC_export_boot, and
  C_theory_textC_export_file behave similarly to the respective outer commands
  ML_file, C_file,
  ML, setup,
  C, C_export_boot, and
  C_export_file.

   C_theory_text≈setup ‹f'› has the same semantics
  as C_theory_textsetup ‹f› whenever term stack top
  env. f' stack top env = f. In particular, depending on where the annotation
  C_theory_text≈setup ‹f'› is located in the C code, the
  additional values stack›, top› and env› can vary
  drastically, and can then be used in the body of f'› to implement new
  interactive features (in contrast to f›, which by default cannot
  directly use the information provided by stack›, top›
  and env›).

   C_theory_texthighlight changes the background color of the C tokens pointed to by the command.

   C_theory_text#ML_file,
  C_theory_text#C_file, C_theory_text#ML,
  C_theory_text#setup,
  C_theory_text#C,
  C_theory_text#C_export_boot, and
  C_theory_text#C_export_file
  behave similarly to the respective (above) inner commands
  C_theory_textML_file,
  C_theory_textC_file, C_theory_textML,
  C_theory_textsetup,
  C_theory_textC,
  C_theory_textC_export_boot, and
  C_theory_textC_export_file
  except that their evaluation happens as early as possible.

   C_theory_textML_file⇓,
  C_theory_textC_file⇓, C_theory_textML⇓,
  C_theory_textsetup⇓,
  C_theory_text≈setup⇓, C_theory_textC⇓,
  C_theory_textC_export_boot⇓,
  C_theory_textC_export_file⇓, and
  C_theory_texthighlight⇓
  behave similarly to the respective (above) inner commands
  C_theory_textML_file, C_theory_textC_file,
  C_theory_textML, C_theory_textsetup,
  C_theory_text≈setup, C_theory_textC,
  C_theory_textC_export_boot,
  C_theory_textC_export_file, and
  C_theory_texthighlight
  except that their evaluation happens as late as possible.
›

subsection ‹Inner Directive Commands›

text  Among the directives predefined in Isabelle/C, we currently have
  C#define _ and C#undef _. In particular, for the case of
  C#define _, rewrites are restricted to variable-form macros (as sketched below): support for
  function-like macros is not yet provided.
   In Isabelle/C, not-yet-defined directives (such as C#include _ or
  C#if
  #endif, etc.) do not make the parsing fail, but are treated as ``free variable commands''.
›
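
text ‹ As a small sketch of the variable-form macro support (the names N› and
a› are of course arbitrary), the occurrence of N› in the initializer below is
rewritten to 4› before the parser proceeds:

@{verbatim [display]‹
C ‹
#define N 4
int a = N;
›
›}
›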

section ‹Quick Start (for People More Familiar with C than Isabelle)›

text  Assuming we are working with Isabelle 2021
🌐‹https://isabelle.in.tum.de/website-Isabelle2021/dist/Isabelle2021_linux.tar.gz›, the shortest way to
start programming in C is to open a new theory file with the shell-command:

‹$ISABELLE_HOME/bin/isabelle jedit -d $AFP_HOME/thys Scratch.thy›

where ‹$ISABELLE_HOME› is the path of the above extracted Isabelle source,
and ‹$AFP_HOME› is the downloaded content of
🌐‹https://foss.heptapod.net/isa-afp/afp-2021›.‹This folder
particularly contains the Isabelle/C project, located in
🌐‹https://foss.heptapod.net/isa-afp/afp-2021/-/tree/branch/default/thys/Isabelle_C›. To inspect
the latest developer version, one can also replace ‹$AFP_HOME/thys› by the
content downloaded from 🌐‹https://gitlab.lisn.upsaclay.fr/burkhart.wolff/Isabelle_C›.›
 The next step is to copy this minimal content inside the newly opened window:
‹theory Scratch imports Isabelle_C.C_Main begin C ‹
// C code
› end›
 ‹Quod Erat Demonstrandum!› This already enables the support of C code inside the special
brackets ``‹‹››'', now depicted as
``‹››'' for readability reasons. ›

text_raw ‹
\begin{figure}
  \centering
\includegraphics[width=\textwidth]{figures/C-export-example}
  \caption{Making the File Browser Point to the Virtual File System}
  \label{fig:file-bro}
\end{figure}
›

text ‹ Additionally, Isabelle/C comes with several functionalities that can be explored as
alternatives:
 To write theorems and proofs along with C code, the special C comment
C/*@ (* Isabelle content *) */ can be used at any position where C comments are
usually allowed (a minimal sketch is displayed after this list). At the time of writing, not all
Isabelle commands can yet be written in C
comments, and certain proof-solving-command combinations are also not yet implemented --- manual
registration of commands to retrieve a more or less native user experience remains possible
though. Generally, the kind of content one can write in C comments should be arbitrary. The
exhaustive list of Isabelle commands is provided in the above accompanying archive, for example in
🗀‹$ISABELLE_HOME/src/Doc/Isar_Ref› or
🌐‹https://isabelle.in.tum.de/doc/isar-ref.pdf›.

 Instead of starting from scratch, any existing C file can also be opened with Isabelle/C;
it suffices to replace:

\begin{tabular}{c}
 ‹C› ‹ /* C */ › \\
 by \\
 ‹C_file› ‹~/file.c›
\end{tabular}

Once done, one can press a CTRL-like key while hovering the mouse over the file name, and then
click on it to open a new window loading that file.

 After a ‹C› ‹ /* C */ ›
command, one can either keep the content as such in the theory file, or use
‹C_export_file› to export all previous C content into a ``real'' C file.

Note that since Isabelle2019, Isabelle/C uses a virtual file-system. As a consequence,
some extra operations are needed to export a file generated into the virtual file-system of Isabelle
into the ``real'' file-system. First, the ‹C_export_file› command needs to
be activated, by putting the cursor on the command. This leads to the following message in the
output window: ‹See theory exports "C/*/*.c"› (see
\autoref{fig:file-bro}). By clicking on theory exports› in
this message, Isabelle opens a File Browser› showing the content of the virtual
file-system in the left window. Selecting and opening a generated file in the latter lets jEdit
display it in a new buffer, which gives the possibility to export this file via ``File
→ Save As…›'' into the real file-system. ›
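
text ‹ As a minimal sketch of the first point above (the C function and the printed message are
arbitrary, and ML is one of the annotation commands listed earlier), an Isabelle annotation can be
attached to C code as follows:

@{verbatim [display]‹
C ‹
int sum (int a, int b) { return a + b; }
/*@ ML ‹writeln "evaluated from within a C comment"› */
›
›}
›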

section ‹Case Study: Mapping on the Parsed AST via C99 ASTs›

text ‹ In this section, we give a concrete example of a situation where one is interested in
performing automated transformations on the parsed AST, such as changing the type of every
encountered variable from Cint _; to Cint _ [];. The main theory of
interest here is theoryIsabelle_C.C_Parser_Language, where the C grammar is
loaded, in contrast to theoryIsabelle_C.C_Lexer_Language which is only dedicated
to building a list of C tokens. As another example,
theoryIsabelle_C.C_Parser_Language also contains the portion of the code
implementing the report to the user of various characteristics of variables encountered during
parsing: whether a variable is bound or free, and whether the declaration of a variable is made in
the global topmost space or locally inside a function. ›

subsection ‹Prerequisites›

text ‹ Even if 🗏‹../generated/c_grammar_fun.grm.sig› and
🗏‹../generated/c_grammar_fun.grm.sml› are files written in ML syntax, we have
actually modified 🗀‹../../src_ext/mlton/lib/mlyacc-lib› in such a way that at run
time, the overall loading and execution of theoryIsabelle_C.C_Parser_Language
will mimic all necessary features of the Haskell parser generator Happy
🌐‹https://www.haskell.org/happy/doc/html/index.html›,
including any monadic interactions between the lexing
(theoryIsabelle_C.C_Lexer_Language) and parsing part
(theoryIsabelle_C.C_Parser_Language).

This is why, in the remaining part, we assume familiarity with Happy (e.g.,
the reading of ML-Yacc's manual can happen later if wished
🌐‹https://www.cs.princeton.edu/~appel/modern/ml/ml-yacc/manual.html›). In
particular, we will use the term ‹rule code› to designate ‹a
Haskell expression enclosed in braces›
🌐‹https://www.haskell.org/happy/doc/html/sec-grammar.html›.
›

subsection ‹Structure of theoryIsabelle_C.C_Parser_Language

text ‹ In more detail, theoryIsabelle_C.C_Parser_Language can be seen as being
principally divided into two parts:
   a first part containing the implementation of
  ML_structureC_Grammar_Rule_Lib, which provides the ML implementation library
  used by any rule code written in the C grammar
  🌐‹https://github.com/visq/language-c/blob/master/src/Language/C/Parser/Parser.y›
  (🗏‹../generated/c_grammar_fun.grm.sml›).
   a second part implementing ML_structureC_Grammar_Rule_Wrap, providing
  one wrapping function for each rule code, potentially complementing the rule code with an
  additional action to be executed after its call. The use of wrapping functions is entirely optional:
  by default, they are all assigned the identity function.

The difference between ML_structureC_Grammar_Rule_Lib and
ML_structureC_Grammar_Rule_Wrap lies in how often functions in the two
structures are called: while building subtree pieces of the final AST, grammar rules are free to
call any functions in ML_structureC_Grammar_Rule_Lib to complete their
respective tasks, but also free to not use ML_structureC_Grammar_Rule_Lib at
all. On the other hand, irrespective of the actions done by a rule code, the function associated to
the rule code in ML_structureC_Grammar_Rule_Wrap is retrieved and always executed
(though a visible side-effect will mostly only happen when one has provided an implementation
different from the identity function MLI). ›

text ‹ Because the grammar
🌐‹https://github.com/visq/language-c/blob/master/src/Language/C/Parser/Parser.y›
(🗏‹../generated/c_grammar_fun.grm.sml›) has been defined in such a way that
the computation of variable scopes is completely handled by functions in
ML_structureC_Grammar_Rule_Lib and not in rule code (which just call
functions in ML_structureC_Grammar_Rule_Lib), it is enough to overload functions
in ML_structureC_Grammar_Rule_Lib whenever it is wished to perform new actions
depending on variable scopes, for example to do a specific PIDE report the first time a C
variable is declared. In particular, functions in
ML_structureC_Grammar_Rule_Lib are implemented in monadic style, making a
subsequent modification of the parsing environment theoryIsabelle_C.C_Environment possible
(whenever appropriate), as the latter is carried in the monadic state.

Fundamentally, this is feasible because the monadic environment fulfills the property of being
always properly enriched with declared variable information at any time, because we assume
   working with a language where a used variable must be declared or redeclared
  somewhere before its actual use,
   and using a parser scanning tokens uniquely, from left to right, in the same order as
  the execution of rule code actions. ›

subsubsection ‹Example›

text ‹ As an illustration, MLC_Grammar_Rule_Lib.markup_var o C_Ast.Left is
(implicitly) called by a rule code when a variable being declared is encountered. Later, a call to
MLC_Grammar_Rule_Lib.markup_var o C_Ast.Right in
ML_structureC_Grammar_Rule_Wrap (actually, in
ML_structureC_Grammar_Rule_Wrap_Overloading) is made after the execution of
another rule code to signal the position of a variable in use, together with the position of its
declaration, retrieved from the environment. ›

text ‹ In more detail, the second argument of
MLC_Grammar_Rule_Lib.markup_var is, among others, of the form
ML_typePosition.T * {global: bool}, where in particular the field
ML#global : C_Env.markup_ident -> bool of the record informs
MLC_Grammar_Rule_Lib.markup_var whether the variable being reported (at either first
declaration time, or first use time) is global or local (inside a function for instance). Because,
once declared, the property ML#global : C_Env.markup_ident -> bool of a variable
does not change afterwards, it is enough to store that information in the monadic environment:
 ‹Storing the information at declaration time› The part deciding if a
variable being declared is global or not is implemented in
MLC_Grammar_Rule_Lib.doDeclIdent and
MLC_Grammar_Rule_Lib.doFuncParamDeclIdent. The two functions come from
🌐‹https://github.com/visq/language-c/blob/master/src/Language/C/Parser/Parser.y›
(so do any functions in ML_structureC_Grammar_Rule_Lib). Ultimately, they
both call MLC_Grammar_Rule_Lib.markup_var o C_Ast.Left at some point.
 ‹Retrieving the information at use time›
MLC_Grammar_Rule_Lib.markup_var o C_Ast.Right is only called by
MLC_Grammar_Rule_Wrap.primary_expression1, while treating a variable that is
already declared. In particular, the second argument of
MLC_Grammar_Rule_Lib.markup_var is just provided by what was computed at the
above point when the variable was declared (e.g., the globality versus locality
information); a small illustrating C fragment is displayed below. ›
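
text ‹ A minimal C fragment exercising these reports (the names are arbitrary) could be:

@{verbatim [display]‹
C ‹
int g;                   /* declared in the global topmost space */
int f (int x) {
  int y = g + x;         /* x and y are local, g refers to the global declaration */
  return y;
}
›
›}
›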

subsection ‹Rewriting of AST node›

text ‹ For the case of rewriting a specific AST node, from subtree T1› to
subtree T2›, it is useful to zoom in on the different parsing evaluation stages, as well
as to make precise when the evaluation of semantic back-ends starts.

 Whereas annotations in Isabelle/C code have the potential of carrying arbitrary ML code (as
in theoryIsabelle_C.C2), the moment when they are effectively evaluated
will not be discussed here, because to closely follow the semantics of the embedded language (so
C), we suppose comments --- comprising annotations --- may not affect any parsed tokens living
outside comments. So no matter when annotations are scheduled to be evaluated in Isabelle/C,
the design decision of Isabelle/C is to not let code perform directive-like side-effects in
annotations, such as changing T1› to T2› inside annotations.

 To our knowledge, the sole category of code having the capacity to affect the incoming stream
of tokens is that of directives, which are processed and evaluated before the ``major'' parsing step
occurs. Since in Isabelle/C directives rely on ML code, changing an AST node from
T1› to T2› can then be perfectly implemented in directives.

 After the directive (pre)processing step, the main parsing happens. But since the parsing
engine is principally driven by rule code, this step amounts to executing
ML_structureC_Grammar_Rule_Lib and
ML_structureC_Grammar_Rule_Wrap, i.e., the rules in
🗏‹../generated/c_grammar_fun.grm.sml›.

 Once the parsing finishes, we have a final AST value, whose topmost root type entry-point
constitutes the last node built before the grammar parser
🌐‹https://github.com/visq/language-c/blob/master/src/Language/C/Parser/Parser.y›
ever entered a stop state. For the case of a stop acceptance state, that moment happens when we
reach the first rule code building the type ML_typeC_Ast.CTranslUnit, since there
is only one possible node making the parsing stop, according to what is currently written in the C
grammar. (For the case of a state stopped due to an error, it is the last successfully built value
that is returned, but to simplify the discussion, we will assume in the rest of the document that the
parser takes as input a fully well-parsed C code.)

 By ‹semantic back-ends›, we denote any kind of ``relatively
efficient'' compiled code generating Isabelle/HOL theorems, proofs, definitions, and so with the
potential of generally generating Isabelle packages. In our case, the input of semantic back-ends
will be the type ML_typeC_Ast.CTranslUnit (actually, whatever value is provided by
the above parser). But since our parser is written in monadic style, it is also possible to give
slightly more information to semantic back-ends, such as the last computed monadic state, so
including the last state of the parsing environment. ›

text ‹ Generally, semantic back-ends can be written in full ML starting from
ML_typeC_Ast.CTranslUnit, but to additionally support formalizing tasks that require
starting from an AST defined in Isabelle/HOL, we provide an equivalent AST in HOL in the project,
such as the one obtained after loading
🌐‹https://gitlab.lisn.upsaclay.fr/frederictuong/isabelle_contrib/-/blob/master/Citadelle/doc/Meta_C_generated.thy›.
(In fact, the ML AST is just generated from the HOL one.) ›



text ‹
Based on the above information, there are now several ‹equivalent› ways to
proceed for the purpose of having an AST node mapped from T1› to
T2›. The next bullets, providing several possible solutions to follow, are
sorted by increasing action time.

 ‹Before even starting the Isabelle system.› A first approach would be
to modify the C code in input, by adding a directive C#define _ _ performing the
necessary rewrite.

 ‹Before even starting the Isabelle system.› As an alternative to
changing the C code, one can modify
🌐‹https://github.com/visq/language-c/blob/master/src/Language/C/Parser/Parser.y›
by hand, by explicitly writing T2› at the specific position of the rule code
generating T1›. However, this solution implies re-generating
🗏‹../generated/c_grammar_fun.grm.sml›.

 ‹At grammar loading time, while the source of Isabelle/C is still being
processed.› Instead of modifying the grammar, it should be possible to first locate which
rule code is building T1›. Then it would remain to retrieve and modify the respective
function of ML_structureC_Grammar_Rule_Wrap executed after that rule code, by
providing a replacement function to be put in
ML_structureC_Grammar_Rule_Wrap_Overloading. However, as a design decision,
wrapping functions generated in 🗏‹../generated/c_grammar_fun.grm.sml› have only
been generated to affect monadic states, not AST values. This is to prevent an erroneous replacement
by an end-user while parsing C code. (It is currently left open whether this feature will be
implemented in future versions of the parser...)

 ‹At directive setup time, before executing any
C command of interest.› Since the behavior of directives can be
dynamically modified, this solution amounts to changing the semantics of any wished directive
appearing early enough in the code. (But for the overall code to be, in the end, mostly compatible with
any other C preprocessor, the implementation change has to be at least consistent with how
a preprocessor is already expected to treat an initial unpreprocessed C code.) For example, the
current semantics of C#undef _ depends on what has been registered in
MLC_Context.directive_update (see theoryIsabelle_C.C_Command).

 ‹After parsing and obtaining a constructive value.› Another solution
consists in directly writing a mapping function acting on the full AST, i.e. an ML function of
type ML_typeC_Ast.CTranslUnit -> C_Ast.CTranslUnit (or a respective HOL function)
which has to act on every constructor of the AST (so in the worst case about a hundred constructors
for the considered AST, i.e., whenever a node has to be not identically returned); a minimal sketch
is displayed after this list. However, as we
have already implemented a conversion function from ML_typeC_Ast.CTranslUnit
(subset of C11) to a subset AST of C99, it might be useful to save some effort by starting from this
conversion function, locating where T1› is pattern-matched by the conversion function,
and generating T2› instead.

As an example, the conversion function MLC_Ast.main is particularly used to connect
the C11 front-end to the entry-point of AutoCorres in
‹l4v/src/tools/c-parser/StrictCParser.ML›.

 ‹At semantic back-end execution time.› The above points were dealing
with the cases where modification actions were all occurring before getting a final
ML_typeC_Ast.CTranslUnit value. But this does not mean it is forbidden to make
some slight adjustments once that resulting ML_typeC_Ast.CTranslUnit value is
obtained. In particular, it is precisely the task of semantic back-ends to work with
ML_typeC_Ast.CTranslUnit as starting point, and possibly translate it to another,
different type. So letting a semantic back-end implement the mapping from T1› to
T2› would mean here first understanding the architecture of the back-end of interest, to
see where the necessary minimal modifications must be made.

By taking l4v as a back-end example, its integration with Isabelle/C first starts with translating
ML_typeC_Ast.CTranslUnit to l4v's default C99 AST. Then various analyses on the
obtained AST are performed in
🌐‹https://github.com/seL4/l4v/tree/master/tools/c-parser› (the reader interested
in the details can start by further exploring the ML files loaded by
🌐‹https://github.com/seL4/l4v/blob/master/tools/c-parser/CTranslation.thy›). In
short, to implement the mapping from T1› to T2› in the back-end part,
one can either:
   modify the translation from ML_typeC_Ast.CTranslUnit to C99,
   or modify the necessary ML files of interests in the l4v project.
›
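
text ‹ A minimal sketch of the ``after parsing'' solution above (assuming nothing beyond the
type ML_typeC_Ast.CTranslUnit itself) starts from a total function on the final AST
value; the actual work consists in replacing the identity behavior by one pattern-match case per
constructor where T1› has to be turned into T2›:

@{verbatim [display]‹
ML ‹
(* identity by default: to be refined with the constructor cases of interest *)
fun rewrite_ast (ast : C_Ast.CTranslUnit) : C_Ast.CTranslUnit = ast
›
›}
›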

text ‹ More generally, to better inspect the list of rule code really executed when a C code
is parsed, it might be helpful to proceed as in theoryIsabelle_C.C2, by activating
declare[[C_parser_trace]]. Then, the output window will display the
sequence of Shift-Reduce actions associated with the C command of
interest.
›

text‹NOTE›: the C99 library is part of the configuration with AutoCorres, which is
currently only available for Isabelle2019.›

section ‹Case Study: Mapping on the Parsed AST via the ML_structureC11_Ast_Lib

MLopen C11_Ast_Lib
text‹A simpler alternative for connecting Isabelle/C to a semantic back-end is the use of the
ML_structureC11_Ast_Lib, an API for the C11 abstract syntax. Among a number of utilities,
it provides a family of iterators (or: hylomorphisms, generalized fold operators, or whatever
terminology you prefer). There is a fold operator for each C11 Ast-category:
 MLfold_cArraySize: 'a -> (node_content->'a->'b->'b) -> 'a C_Ast.cArraySize -> 'b -> 'b›
 MLfold_cCompoundBlockItem: (node_content->'a->'b->'b) -> 'a C_Ast.cCompoundBlockItem->'b->'b›
 MLfold_cDeclaration: (node_content->'a->'b->'b) -> 'a C_Ast.cDeclaration -> 'b -> 'b›
 MLfold_cExpression: (node_content->'a->'b->'b) -> 'a C_Ast.cExpression -> 'b -> 'b›
 MLfold_cStatement: (node_content->'a->'b->'b)  -> 'a C_Ast.cStatement -> 'b -> 'b›
 MLfold_cExternalDeclaration: (node_content->'a->'b->'b) -> 'a C_Ast.cExternalDeclaration->'b->'b›
 MLfold_cTranslationUnit: (node_content->'a->'b->'b) -> 'a C_Ast.cTranslationUnit -> 'b -> 'b›
 etc.
›
text‹Here, ML_typenode_content  is a data-structure providing untyped and uniform 
information on which rule has been applied, and what kind of particular decoration appears in the
C11-Ast. ›

text‹ This allows for a simple programming of queries like "get the list of identifiers"
directly on the C11-Ast. Moreover, it is pretty straightforward to program a compiler to
λ›-terms for a specific semantic interpretation in Isabelle/HOL. A simple example follows:›

text@{theory_text [display]
declare [[Crule0 = "expression"]]
 C‹a + b * c - a / b›
 ML‹val ast_expr = @{C11_CExpr}›}

@{verbatim [display]‹
fun selectIdent0Binary (a as { tag, sub_tag, args }:C11_Ast_Lib.node_content) 
                       (b:  C_Ast.nodeInfo ) 
                       (c : term list)=  
    case tag of
      "Ident0" => (node_content_2_free a)::c
     |"CBinary0" => (case (drop_dark_matter sub_tag, c) of 
                      ("CAddOp0",b::a::R) => (Const("Groups.plus_class.plus",dummyT) $ a $ b :: R)
                    | ("CMulOp0",b::a::R) => (Const("Groups.times_class.times",dummyT) $ a $ b :: R)
                    | ("CDivOp0",b::a::R) => (Const("Rings.divide_class.divide",dummyT) $ a $ b :: R)
                    | ("CSubOp0",b::a::R) => (Const("Groups.minus_class.minus",dummyT) $ a $ b :: R)
                    | _ => (writeln ("sub_tag all " ^sub_tag^" :>> "^ @{make_string} c);c ))
     | _ => c;


val S =  (C11_Ast_Lib.fold_cExpression selectIdent0Binary ast_expr []);
(* gives the (untyped) equivalent of : *)
val S' = @{term "a + b * c - a / b"};
›}

text‹This snippet is drawn from the C11-Example shown in Appendix III.›
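
text‹In the same spirit, the query mentioned above (``get the list of identifiers'') can be
approached with the same fold operators. The following sketch merely counts identifier occurrences
instead of extracting their names; it only assumes the tag "Ident0"› and the value
ast_expr› bound in the example above:

@{verbatim [display]‹
fun count_idents ({tag, ...} : C11_Ast_Lib.node_content)
                 (_ : C_Ast.nodeInfo)
                 (n : int) =
  if tag = "Ident0" then n + 1 else n;

val n_idents = C11_Ast_Lib.fold_cExpression count_idents ast_expr 0;
(* for the expression  a + b * c - a / b  this yields 5 identifier occurrences *)
›}
›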

section ‹Known Limitations, Troubleshooting›
subsection ‹The Document Model of the Isabelle/PIDE (applying since at least Isabelle 2019)›
subsubsection ‹Introduction›

text ‹ Embedding C directives in C code is common practice in numerous applications,
and is largely highlighted in the C standard. As an example of a frequently encountered
directive, #include <some_file.c>› is used to insert the content of
some_file.c› at the place where it is written. In Isabelle/C, we can also write C
code containing directives like #include›, and generally the PIDE reporting of
directives is supported to a certain extent. Yet, the dynamic inclusion of arbitrary files with
#include› hits a certain technological barrier. This is related to how the
document model of Isabelle 2019 functions, but also to the design decisions behind the
implementation of C‹ .. ›. Thus, providing a complete
semantic implementation of #include› might be not as evident as usual, if not more
dangerous, i.e. ``something requiring a manual intervention in the source of Isabelle 2019''. In the
next part, we show why in our current implementation of Isabelle/C there is no way for user
programmed extensions to exploit implicit dependencies between sub-documents in pure ML: a
sub-document referred to via #include <some_file>› will not lead to a reevaluation of
a C‹ .. › command whenever modified.›

subsubsection ‹Embedding a language in Isabelle/PIDE›

text ‹
To clarify how the way a language is embedded in Isabelle influences the interaction between
a future parser of the language and Isabelle's document model, we recall the two ``different''
ways of embedding a language in Isabelle/PIDE.

At its most basic form, the general syntactic scope of an Isabelle/Isar document can be seen as
being composed of two syntactic alternations of editing space: fragments of the inner syntax
language, themselves part of the more general outer syntax (the inner syntax is implemented as an
atomic entity of the outer language); see
🗏‹~~/src/Doc/Isar_Ref/Outer_Syntax.thy›. So strictly speaking, when attempting
to support a new language L› in Isabelle, there is always the question of estimating,
at a fine grain, which subsets of L› will be represented in the outer syntax part, and whether
any subsets remain to be represented in the more inner (syntactic) part.

Generally, to answer this question, there are several criteria to consider:
   Are there any escaping symbols conflicting between L› and the outer
  (syntax) language, including for example the ASCII ‹"› or
  ‹`›?
   Is L› a realistic language, i.e. more complex than any combination of
  outer named tokens that can ever be covered in terms of expressive power (where the list of
  outer named tokens is provided in 🗏‹~~/src/Doc/Isar_Ref/Outer_Syntax.thy›)?
   Is it preferable not to alter the outer syntax language with too specific and
  challenging features of L›? This is particularly true since in Isabelle 2019, there
  is no way of modifying the outer syntax without making the modifications irremediably happen on
  its source code.

For the above reasons, we have come up in Isabelle/C with the choice of making the full C language
be supported inside the space allocated to the inner syntax. In particular, this has become all the more
syntactically easy with the introduction of cartouches since Isabelle
2014.‹Fortunately, parsing tokens of C do not strongly conflict with cartouche
delimiter symbols. For example, it is not in itself invalid in C to write an opening cartouche
symbol (possibly in a C comment) without writing any closing cartouche symbol afterwards. However,
we have not encountered such C code in our tested codebase, and it is a functionality implicitly
rejected by the current parser of Isabelle/C, as it relies on Isabelle 2019's parser combinator
library for the lexing part.› However, for the case of the C language, certain C directives
like #include› are meant to heavily interact with external files. In particular,
resources would be best utilized if we were taking advantage of Isabelle's asynchronous document
model for such interaction tasks. Unfortunately, the inner syntax space only has a minimal
interaction with the document model, compared to the outer syntax one. Otherwise said, be it for
experimenting with the inner syntax layer to see how far it can deal with the document layer, or
for reimplementing parts of Isabelle/C in the outer syntax layer, both solutions lead to
modifications in the Isabelle 2019 source code. ›

text ‹ Note that the language embedding space of C closely
resembles how ML sources are delimited within an ML
command. Additionally, in ML, one can use antiquotations to also refer to external files
(particularly in formal comments). Still, the problem is also present in ML: the referred files are not
loaded into the document model. ›

subsubsection ‹Examples›

text  Commands declared as of type thy_decl› in the theory header are scheduled
  to be executed once. Additionally, they do not track the content of the file names provided as
  arguments, so a change there will not trigger a reevaluation of the associated command. For
  example, even if the type of ML_file is not thy_decl›,
  nothing prevents one from setting it to be of type thy_decl›. In particular, by doing so,
  it is no longer possible to CTRL-hover-click on the file name written after
  ML_file.
   To make a command C› track the content of file› whenever the
  file changes, setting C› to be of type thy_load› in the theory
  header is a first step, but not enough. To be effective, file› must also be loaded,
  by either explicitly opening it, or clicking on its name after the command. Examples of commands
  in this situation requiring a preliminary one-time click include:
  external_file, bibtex_file, and
  ML_file.
  Internally, the click is bound to Scala code invoking a request to make an asynchronous
  dependency to the newly opened document on the ML side.
   In terms of recursivity, for the case of a chain of sub-documents of the form
  (a theory file containing: C_file ‹file0.c›)
  ⟹›
  (C file ‹file0.c› containing: C#include <file1.c>)
  ⟹›
  (C file ‹file1.c› containing: C#include <file2.c>)
  ⟹›
  (C file ‹file2.c› containing: C#include <file3.c>), we
  ideally expect a modification in ‹file3.c› to be taken into account in all
  ancestor files including the initial theory, provoking the associated command of the theory to be
  reevaluated. However, in C, directive resolution might be close to Turing-complete. For instance,
  one can also include files based on particular conditional situations: C#if _
    #include <file1>
  #else
    #include <file2>
    #include <file3>
  #endif
   When a theory depends on other theories (such as
  theoryIsabelle_C.C_Eval depending on
  theoryIsabelle_C.C_Parser_Language and
  theoryIsabelle_C.C_Parser_Annotation), modifying the list of imported
  theories automatically triggers what the user expects: for example, the newly added
  theories are dynamically imported, and any change made by another external editor is
  consequently propagated.

  Following the internal implementation of the document model engine, we basically distinguish two
  phases of document importation: either at start-up time, or dynamically on user requests. Although
  the case of start-up time can be handled on the pure ML side, the language dedicated to expressing which
  Isabelle theory files to import is less powerful than the close-to-Turing-complete
  expressivity of C directives. On the other hand, the dynamic importation of files on user requests
  seems to be performed (at the time of writing) through a too high-level ML protocol, mostly called
  from the Scala side. Due to the fact that Isabelle/C is currently implemented in pure ML, a solution
  also in pure ML would thus sound more natural (although we are not excluding solutions interacting
  with Scala, as long as the result can be implemented in Isabelle, preferably outside of its own
  source).›

subsection ‹Parsing Error versus Parsing Correctness›

text ‹ When trying to decide if the next parsing action is a Shift or Reduce action to
perform, the grammar simulator MLLALR_Parser_Eval.parse can actually decide to do
another action: ignore everything and immediately stop the simulation.

If the parser ever decides to stop, this can only be for two reasons:
 The parser is supposed to have correctly finished its parsing task, leaving it in an
acceptance state. As acceptance states are encoded in the grammar, it is easy to check whether this
information is correct, or if it has to be adjusted in more detail by inspecting
🌐‹https://github.com/visq/language-c/blob/master/src/Language/C/Parser/Parser.y›
(🗏‹../generated/c_grammar_fun.grm.sml›).
 The parser seems to be unable to correctly finish its parsing task. In this case, the user
will see an error explicitly raised by the prover IDE. However, raising an error is just the
default behavior of Isabelle/C: the decision whether to raise interruptive errors ultimately depends
on how front-end commands are implemented (such as C,
C_file, etc.). For instance, similarly to how outer syntax commands
are implemented, we can imagine a tool implementing a kind of partial parsing, analyzing the longest
sequence of well-formed input, and discarding some strategic next set of faulty tokens with a well
suited informative message, so that the parsing process could be maximally repeated on what is
coming afterwards.

Currently, the default behavior of Isabelle/C is to raise the error defined in
MLC_Module.err at the very first opportunity ‹At the time of
writing it is: ‹No matching grammar rule›.›. The possible solutions to
make the error disappear at the position the error is indicated can be detailed as follows:
   Modifying the C code in input would be a first solution whenever we suspect something is
  making it erroneous (and when we have a reason to believe that the grammar is behaving as it
  should).

   However, we could still get the above error in front of an input that one would usually
  expect not to cause a failure. In particular, there are several C features (such as C
  directives) explicitly left to semantic back-ends for (pre)processing, so in general not fully
  semantically processed at parsing time.

  For example, whereas the code C#define i int
  i a; succeeds (rendered as a displayed sketch after this list), replacing its first line with the directive
  C#include <file.c> will not initially work, even if file.c›
  contains C#define i int, as the former directive has been left for semantic
  back-end treatment. One way of solving this would be to modify the C code in input for it to be
  already preprocessed (without directives, for example the C example of
  🌐‹https://gitlab.lisn.upsaclay.fr/burkhart.wolff/Isabelle_C/-/blob/C/C11-BackEnds/AutoCorres_wrapper/examples/TestSEL4.thy› is already provided as
  preprocessed). Another way would be adding a specific new semantic back-end implementing the
  automation of the preprocessing task (as done in
  🌐‹https://gitlab.lisn.upsaclay.fr/burkhart.wolff/Isabelle_C/-/blob/C/C11-BackEnds/AutoCorres_wrapper/examples/IsPrime_linear_CCT.thy›, where the
  back-end explicitly makes a call to cpp› at run-time).

   Ultimately, modifying the grammar with new rules cancelling the exception would only work
  if the problem really lies in the grammar, as mentioned above for the acceptance state.
  ›
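
text ‹ As a concrete rendering of the C#define example discussed above, the
following input parses successfully, whereas moving the #define› into an included file
leaves it to semantic back-end treatment:

@{verbatim [display]‹
C ‹
#define i int
i a;
›
›}
›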

text ‹ In terms of parsing correctness, Isabelle/C provides at least two different parsers:
 a parser limited to C99/C11 code provided in 🗀‹../../C11-FrontEnd› that can
parse certain liberal extensions out of the C
standard~🌐‹http://hackage.haskell.org/package/language-c›;
 and another parser accepting C99/C11/C18 code in 🌐‹https://gitlab.lisn.upsaclay.fr/burkhart.wolff/Isabelle_C/-/tree/C/C18-FrontEnd› that
is close to the C standard while focusing on resolving ambiguities of the
standard~🌐‹https://github.com/jhjourdan/C11parser›~cite"DBLP:journals/toplas/JourdanP17". ›

text ‹ Note that the two parsers do not accept/reject the same range of arbitrary C
code. We have actually already encountered situations where an error is raised by one parser, while
the other parser succeeds (and vice versa). Consequently, in front of a C code,
it can be a good recommendation to try out the parsing with all available parsers of Isabelle/C. In
any case, a failure in one or several activated parsers is not necessarily harmful: it might
also indicate that a wrong parser has been selected, or that a semantic back-end does not yet support
aspects of the C code being parsed. ›

subsection ‹Exporting C Files to the File-System›

text ‹ From the Isabelle/C side, the task is easy; just type:›

C_export_file

text ‹ ... which does the trick and generates a file
‹C_Appendices.c›. But hold on --- where is it? Well, since Isabelle2019,
Isabelle/C uses a virtual file-system. Exporting from it to the real file-system requires a few
mouse-clicks (unfortunately).

So activating the command C_export_file leads to the output
‹See theory exports "C/*/C_Appendices.c"› (see
\autoref{fig:file-bro}), and clicking on the highlighted
‹theory exports› lets Isabelle display a part of the virtual file-system
(see the subwidget on the left). Activating it in the subwidget lets jEdit open it as an editable file, which
can be exported via ``File → Save As…›'' into the real
file-system. ›

end