From 7bcb0460982307e0e38e75c6037aefe63a3ddce8 Mon Sep 17 00:00:00 2001 From: Mikael Mayer Date: Thu, 8 Sep 2022 15:17:22 -0500 Subject: [PATCH 1/8] Feat: Support for full UTF8 parsing by default. This PR enables support for parsing any UTF-8 string in a source file and in comments. --- Source/Dafny/Coco/Scanner.frame | 3 +++ Source/DafnyPipeline.Test/Trivia.cs | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Source/Dafny/Coco/Scanner.frame b/Source/Dafny/Coco/Scanner.frame index 423f113be06..3d8b6d1d5f7 100644 --- a/Source/Dafny/Coco/Scanner.frame +++ b/Source/Dafny/Coco/Scanner.frame @@ -354,6 +354,9 @@ public class Scanner { } buffer = new UTF8Buffer(buffer); col = 0; NextCh(); + } else { + // We consider all files to be UTF-8 by default + buffer = new UTF8Buffer(buffer); } pt = tokens = new Token(); // first token is a dummy } diff --git a/Source/DafnyPipeline.Test/Trivia.cs b/Source/DafnyPipeline.Test/Trivia.cs index ecfdc10c7e9..688c7a2957c 100644 --- a/Source/DafnyPipeline.Test/Trivia.cs +++ b/Source/DafnyPipeline.Test/Trivia.cs @@ -24,7 +24,7 @@ enum Newlines { LF, CR, CRLF }; foreach (Newlines newLinesType in Enum.GetValues(typeof(Newlines))) { currentNewlines = newLinesType; var programString = @" -// Comment before +// Comment ∈ before module Test // Module docstring {} @@ -92,7 +92,7 @@ ensures true var f = defaultClass.Members[1]; var g = defaultClass.Members[2]; var m = defaultClass.Members[3]; - AssertTrivia(moduleTest, "\n// Comment before\n", " // Module docstring\n"); + AssertTrivia(moduleTest, "\n// Comment ∈ before\n", " // Module docstring\n"); AssertTrivia(trait1, "/** Trait docstring */\n", " "); AssertTrivia(trait2, "// Just a comment\n", "\n// Trait docstring\n"); AssertTrivia(subsetType, "// This is attached to n\n", "\n// This is attached to n as well\n\n"); From 065c7e09f4a71730f5ee809f001a72ae1d7b1bcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=C3=ABl=20Mayer?= Date: Thu, 8 Sep 2022 15:19:35 -0500 Subject: [PATCH 2/8] Update RELEASE_NOTES.md --- RELEASE_NOTES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 29d7149253e..b6c090e1756 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,5 +1,6 @@ # Upcoming +- feat: Support for parsing UTF8 characters in code and comments (https://github.com/dafny-lang/dafny/pull/2717) - feat: Command-line arguments are now available from `Main` in Dafny programs, using `Main(args: seq)` (https://github.com/dafny-lang/dafny/pull/2594) - feat: generate warning when 'old' has no effect (https://github.com/dafny-lang/dafny/pull/2610) - fix: Missing related position in failing precondition (https://github.com/dafny-lang/dafny/pull/2658) From 6cb6d5ca611f2ea8af6433125da7f73671d5f169 Mon Sep 17 00:00:00 2001 From: Mikael Mayer Date: Wed, 14 Sep 2022 09:17:29 -0500 Subject: [PATCH 3/8] Added a test case --- Test/dafny0/PrintUTF8.dfy | 6 ++++++ Test/dafny0/PrintUTF8.dfy.expect | 3 +++ 2 files changed, 9 insertions(+) create mode 100644 Test/dafny0/PrintUTF8.dfy create mode 100644 Test/dafny0/PrintUTF8.dfy.expect diff --git a/Test/dafny0/PrintUTF8.dfy b/Test/dafny0/PrintUTF8.dfy new file mode 100644 index 00000000000..5d4e376b19d --- /dev/null +++ b/Test/dafny0/PrintUTF8.dfy @@ -0,0 +1,6 @@ +// RUN: %dafny /compile:3 "%s" > "%t" +// RUN: %diff "%s.expect" "%t" + +method Main() { + print "Mikaël fixed UTF8\n"; +} diff --git a/Test/dafny0/PrintUTF8.dfy.expect b/Test/dafny0/PrintUTF8.dfy.expect new file mode 100644 index 00000000000..52a23e906cc --- /dev/null +++ b/Test/dafny0/PrintUTF8.dfy.expect @@ -0,0 +1,3 @@ + +Dafny program verifier finished with 0 verified, 0 errors +Mikaël fixed UTF8 From 25d1860ad14e40d064f3deaeafe34c9d5890d8e2 Mon Sep 17 00:00:00 2001 From: Mikael Mayer Date: Thu, 15 Sep 2022 09:57:19 -0500 Subject: [PATCH 4/8] Better release notes --- RELEASE_NOTES.md | 2 +- docs/DafnyRef/Grammar.md | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 3cf0fa80518..fc6b251887d 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,6 +1,6 @@ # Upcoming -- feat: Support for parsing UTF8 characters in code and comments (https://github.com/dafny-lang/dafny/pull/2717) +- feat: Support for parsing UTF8 characters in code and comments if their UTF16 value is on maximum 2 bytes (https://github.com/dafny-lang/dafny/pull/2717) - feat: Command-line arguments are now available from `Main` in Dafny programs, using `Main(args: seq)` (https://github.com/dafny-lang/dafny/pull/2594) - feat: generate warning when 'old' has no effect (https://github.com/dafny-lang/dafny/pull/2610) - fix: Missing related position in failing precondition (https://github.com/dafny-lang/dafny/pull/2658) diff --git a/docs/DafnyRef/Grammar.md b/docs/DafnyRef/Grammar.md index 69e8e1e9b83..f01d08a6609 100644 --- a/docs/DafnyRef/Grammar.md +++ b/docs/DafnyRef/Grammar.md @@ -89,10 +89,11 @@ let you reconstruct the original grammar. ## 2.1. Dafny Input {#sec-unicode} -Dafny source code files are readable text encoded as UTF-8 Unicode -(because this is what the Coco/R-generated scanner and parser read). +Dafny source code files are readable text encoded as UTF-8 Unicode, where each decoded character has to fit +a single UTF-16 character. This means that every character has to be assignable to a C# 'char', which +has 65536 values. This is what the Coco/R-generated scanner and parser read. All program text other than the contents of comments, character, string and verbatim string literals -consists of printable and white-space ASCII characters, +consists of printable and white-space UTF-8 Unicode characters. that is, ASCII characters in the range `!` to `~`, plus space, tab, cr and nl (ASCII, 9, 10, 13, 32) characters. However, a current limitation of the Coco/R tool used by `dafny` From 1e6278a485297f6be8f5bca50634c7c867a90a3e Mon Sep 17 00:00:00 2001 From: Mikael Mayer Date: Thu, 15 Sep 2022 09:57:40 -0500 Subject: [PATCH 5/8] Added failing tests --- Test/dafny0/PrintUTF8Fails.dfy | 6 ++++++ Test/dafny0/PrintUTF8Fails.dfy.expect | 2 ++ 2 files changed, 8 insertions(+) create mode 100644 Test/dafny0/PrintUTF8Fails.dfy create mode 100644 Test/dafny0/PrintUTF8Fails.dfy.expect diff --git a/Test/dafny0/PrintUTF8Fails.dfy b/Test/dafny0/PrintUTF8Fails.dfy new file mode 100644 index 00000000000..e590386e0c7 --- /dev/null +++ b/Test/dafny0/PrintUTF8Fails.dfy @@ -0,0 +1,6 @@ +// RUN: %dafny_0 /compile:3 "%s" > "%t" +// RUN: %diff "%s.expect" "%t" + +method Main() { + print "Mikaël did not fix 😎\n"; +} diff --git a/Test/dafny0/PrintUTF8Fails.dfy.expect b/Test/dafny0/PrintUTF8Fails.dfy.expect new file mode 100644 index 00000000000..b32db970854 --- /dev/null +++ b/Test/dafny0/PrintUTF8Fails.dfy.expect @@ -0,0 +1,2 @@ +PrintUTF8Fails.dfy(5,8): Error: invalid LogicalExpression +1 parse errors detected in PrintUTF8Fails.dfy From 750f5c6ebe214cb71492ea31a96bc1de96d600ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=C3=ABl=20Mayer?= Date: Fri, 16 Sep 2022 14:54:03 -0500 Subject: [PATCH 6/8] Update docs/DafnyRef/Grammar.md Co-authored-by: Robin Salkeld --- docs/DafnyRef/Grammar.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/DafnyRef/Grammar.md b/docs/DafnyRef/Grammar.md index f01d08a6609..b24fc75f0f6 100644 --- a/docs/DafnyRef/Grammar.md +++ b/docs/DafnyRef/Grammar.md @@ -89,9 +89,7 @@ let you reconstruct the original grammar. ## 2.1. Dafny Input {#sec-unicode} -Dafny source code files are readable text encoded as UTF-8 Unicode, where each decoded character has to fit -a single UTF-16 character. This means that every character has to be assignable to a C# 'char', which -has 65536 values. This is what the Coco/R-generated scanner and parser read. +Dafny source code files are readable text encoded in UTF-8, where each decoded character has to be from the Basic Multilingual Plane and therefore encodable with a single UTF-16 code unit. This is what the Coco/R-generated scanner and parser read. All program text other than the contents of comments, character, string and verbatim string literals consists of printable and white-space UTF-8 Unicode characters. that is, ASCII characters in the range `!` to `~`, plus space, tab, cr and nl (ASCII, 9, 10, 13, 32) characters. From dfedbfed38a923eecd6b2a5c1da526836e6ea4d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=C3=ABl=20Mayer?= Date: Fri, 16 Sep 2022 14:54:25 -0500 Subject: [PATCH 7/8] Update docs/DafnyRef/Grammar.md Co-authored-by: Robin Salkeld --- docs/DafnyRef/Grammar.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/DafnyRef/Grammar.md b/docs/DafnyRef/Grammar.md index b24fc75f0f6..6e09a19fd4f 100644 --- a/docs/DafnyRef/Grammar.md +++ b/docs/DafnyRef/Grammar.md @@ -91,7 +91,7 @@ let you reconstruct the original grammar. Dafny source code files are readable text encoded in UTF-8, where each decoded character has to be from the Basic Multilingual Plane and therefore encodable with a single UTF-16 code unit. This is what the Coco/R-generated scanner and parser read. All program text other than the contents of comments, character, string and verbatim string literals -consists of printable and white-space UTF-8 Unicode characters. +consists of printable and white-space ASCII characters. that is, ASCII characters in the range `!` to `~`, plus space, tab, cr and nl (ASCII, 9, 10, 13, 32) characters. However, a current limitation of the Coco/R tool used by `dafny` From 7c04f6879b97b4ffa837cf49711c5bb17ef27a4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=C3=ABl=20Mayer?= Date: Fri, 16 Sep 2022 14:54:33 -0500 Subject: [PATCH 8/8] Update RELEASE_NOTES.md Co-authored-by: Robin Salkeld --- RELEASE_NOTES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index fc6b251887d..3e9549125b7 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,6 +1,6 @@ # Upcoming -- feat: Support for parsing UTF8 characters in code and comments if their UTF16 value is on maximum 2 bytes (https://github.com/dafny-lang/dafny/pull/2717) +- feat: Support for parsing Basic Multilingual Plane characters from UTF-8 in code and comments (https://github.com/dafny-lang/dafny/pull/2717) - feat: Command-line arguments are now available from `Main` in Dafny programs, using `Main(args: seq)` (https://github.com/dafny-lang/dafny/pull/2594) - feat: generate warning when 'old' has no effect (https://github.com/dafny-lang/dafny/pull/2610) - fix: Missing related position in failing precondition (https://github.com/dafny-lang/dafny/pull/2658)