From 4fa6d378c0db57b880ff031982c26b31520367a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Tuong?=
 <tuong@users.noreply.github.com>
Date: Sat, 23 May 2020 10:44:44 +0000
Subject: [PATCH] support new space symbols, possibly non-standard

git-svn-id: https://projects.brucker.ch/hol-testgen/svn/HOL-TestGen/trunk/hol-testgen@14197 3260e6d1-4efc-4170-b0a7-36055960796d
---
 C11-FrontEnd/examples/C1.thy          |  3 ++-
 C11-FrontEnd/src/C_Environment.thy    |  2 +-
 C11-FrontEnd/src/C_Lexer_Language.thy | 37 +++++++++++++++++++--------
 3 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/C11-FrontEnd/examples/C1.thy b/C11-FrontEnd/examples/C1.thy
index c7e635d5332..61a9544f105 100644
--- a/C11-FrontEnd/examples/C1.thy
+++ b/C11-FrontEnd/examples/C1.thy
@@ -882,8 +882,9 @@ a a /*#include <>*/ // must not be considered as a directive
 C \<comment> \<open>Universal character names in identifiers and Isabelle symbols\<close> \<open>
 #include <stdio.h>
 int main () {
+  char *  _  = " ";
   char * ó\<^url>ò = "ó\<^url>ò";
-  printf ("%s", ó\<^url>ò);
+  printf ("%s %s", ó\<^url>ò, _ );
 }
 \<close>
 
diff --git a/C11-FrontEnd/src/C_Environment.thy b/C11-FrontEnd/src/C_Environment.thy
index 98ab90a55f1..f16fb38a7ca 100644
--- a/C11-FrontEnd/src/C_Environment.thy
+++ b/C11-FrontEnd/src/C_Environment.thy
@@ -351,7 +351,7 @@ fun make env_lang stream_lang env_tree =
        , stream_hook = []
        , stream_hook_excess = []
        , stream_lang = ( Stream_regular
-                       , map_filter (fn C_Scan.Right (C_Lex.Token (_, (C_Lex.Space, _))) => NONE
+                       , map_filter (fn C_Scan.Right (C_Lex.Token (_, (C_Lex.Space _, _))) => NONE
                                       | C_Scan.Right (C_Lex.Token (_, (C_Lex.Comment _, _))) => NONE
                                       | C_Scan.Right tok => SOME (C_Scan.Right tok)
                                       | C_Scan.Left antiq => SOME (C_Scan.Left antiq))
diff --git a/C11-FrontEnd/src/C_Lexer_Language.thy b/C11-FrontEnd/src/C_Lexer_Language.thy
index dc7982fcac0..8a1f90c1910 100644
--- a/C11-FrontEnd/src/C_Lexer_Language.thy
+++ b/C11-FrontEnd/src/C_Lexer_Language.thy
@@ -95,9 +95,13 @@ fun is_ascii_letdig s =
 fun is_ascii_identifier s =
   size s > 0 andalso forall_string is_ascii_letdig s;
 
-val is_ascii_blank_no_line =
-  fn " " => true | "\t" => true | "\^K" => true | "\f" => true
-    | _ => false;
+val ascii_blank_no_line =
+  [ ([" "], NONE)
+  , (["\t", "\^K", "\f"], SOME "Space symbol")
+  , (["\194\160"], SOME "Non-standard space symbol") ]
+
+fun is_ascii_blank_no_line s =
+  exists (#1 #> exists (curry op = s)) ascii_blank_no_line
 end
 \<close>
 
@@ -624,7 +628,7 @@ datatype token_kind =
   String of token_kind_string |
   File of token_kind_string |
   (**)
-  Space | Comment of token_kind_comment | Sharp of int |
+  Space of (string * Symbol_Pos.T) option list | Comment of token_kind_comment | Sharp of int |
   (**)
   Unknown | Error of string * token_group | Directive of token_kind_directive | EOF
 
@@ -785,7 +789,10 @@ val warn_string =
               ("Out of the supported range (character number " ^ Int.toString n ^ ")"
                ^ Position.here pos1))
 
-
+val warn_space =
+  app (fn SOME (msg, (s, pos)) =>
+            Output.information (msg ^ " " ^ @{make_string} s ^ Position.here pos)
+        | _ => ())
 
 fun warn_unknown pos = Output.information ("Unknown symbol" ^ Position.here pos)
 
@@ -800,6 +807,7 @@ val warn = fn
   | Token (_, (Char (_, l), _)) => warn_string l
   | Token (_, (String (_, l), _)) => warn_string l
   | Token (_, (File (_, l), _)) => warn_string l
+  | Token (_, (Space l, _)) => warn_space l
   | Token ((pos, _), (Unknown, _)) => warn_unknown pos
   | Token (_, (Comment (Comment_suspicious (SOME (explicit, msg, _))), _)) =>
       (if explicit then warning else tracing) msg
@@ -921,7 +929,14 @@ fun !!! msg = Symbol_Pos.!!! (fn () => err_prefix ^ msg);
 
 fun !!!! msg = C_Symbol_Pos.!!!! (fn () => err_prefix ^ msg);
 
-val many1_blanks_no_line = many1 C_Symbol.is_ascii_blank_no_line
+val many1_blanks_no_line =
+  Scan.repeat1
+    (one C_Symbol.is_ascii_blank_no_line
+     >> (fn (s, pos) => 
+          List.find (#1 #> exists (curry op = s)) C_Symbol.ascii_blank_no_line
+          |> the
+          |> #2
+          |> Option.map (rpair (s, pos))))
 
 (* identifiers *)
 
@@ -1213,14 +1228,14 @@ val scan_sharp1 = $$$ "#"
 val scan_sharp2 = $$$ "#" @@@ $$$ "#"
 
 val scan_directive =
-  let val f_filter = fn Token (_, (Space, _)) => true
+  let val f_filter = fn Token (_, (Space _, _)) => true
                       | Token (_, (Comment _, _)) => true
                       | Token (_, (Error _, _)) => true
                       | _ => false
       val sharp1 = scan_sharp1 >> token (Sharp 1)
   in    (sharp1 >> single)
     @@@ Scan.repeat (   scan_token scan_file I
-                     || scan_fragment (many1_blanks_no_line >> token Space)
+                     || scan_fragment (scan_token many1_blanks_no_line Space)
                                       comments
                                       (scan_sharp2 >> token (Sharp 2) || sharp1)
                                       I)
@@ -1432,8 +1447,8 @@ val scan_ml =
     let
       fun non_blanks st scan = scan >> pair st 
       fun scan_frag st =
-        scan_fragment (   C_Basic_Symbol_Pos.newline >> token Space >> pair true
-                       || many1_blanks_no_line >> token Space >> pair st)
+        scan_fragment (   scan_token (C_Basic_Symbol_Pos.newline >> K [NONE]) Space >> pair true
+                       || scan_token many1_blanks_no_line Space >> pair st)
                       (non_blanks st comments)
                       ((scan_sharp2 || scan_sharp1) >> token Keyword)
                       (non_blanks false)
@@ -1459,7 +1474,7 @@ fun reader scan syms =
         let
           val pos1 = List.last syms |-> Position.advance;
           val pos2 = Position.advance Symbol.space pos1;
-        in [Token (Position.range (pos1, pos2), (Space, Symbol.space))] end;
+        in [Token (Position.range (pos1, pos2), (Space [NONE], Symbol.space))] end;
 
     val backslash1 =
           $$$ "\\" @@@ many C_Symbol.is_ascii_blank_no_line @@@ C_Basic_Symbol_Pos.newline
-- 
GitLab