Skip to content

Commit

Permalink
Support UTF-8 surrogates for UTF-16 and 32.
Browse files Browse the repository at this point in the history
  • Loading branch information
EdwardCooke committed Aug 28, 2023
1 parent a6845eb commit e8d7113
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 17 deletions.
11 changes: 2 additions & 9 deletions YamlDotNet.Benchmark/Program.cs
Original file line number Diff line number Diff line change
@@ -1,11 +1,4 @@
using System.Globalization;
using BenchmarkDotNet.Running;
using BenchmarkDotNet.Running;
using YamlDotNet.Benchmark;
using YamlDotNet.Core;
using YamlDotNet.Core.Events;
using YamlDotNet.Serialization;
using YamlDotNet.Serialization.NamingConventions;

var dateTimeOffset = new DateTimeOffset(new DateTime(2017, 1, 2, 3, 4, 5), new TimeSpan(-6, 0, 0));
Console.WriteLine(dateTimeOffset.ToString("MM/dd/yyyy HH:mm:ss zzz", CultureInfo.InvariantCulture));
Console.WriteLine(dateTimeOffset.ToString("O", CultureInfo.InvariantCulture));
BenchmarkSwitcher.FromAssembly(typeof(YamlStreamBenchmark).Assembly).Run(args);
16 changes: 15 additions & 1 deletion YamlDotNet.Test/Core/ScannerTests.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// This file is part of YamlDotNet - A .NET library for YAML.
// This file is part of YamlDotNet - A .NET library for YAML.
// Copyright (c) Antoine Aubry and contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of
Expand Down Expand Up @@ -530,6 +530,20 @@ public void Keys_can_start_with_colons_after_double_quoted_values_in_nested_bloc
StreamEnd);
}

[Fact]
public void Utf16StringsAsUtf8SurrogatesWorkCorrectly()
{
    // The backslashes are escaped in the C# literal, so the YAML text given to the scanner
    // contains the two escape sequences \uD83D and \uDC4D inside a double-quoted scalar.
    const string yaml = "Test: \"\\uD83D\\uDC4D\"";
    var scanner = Yaml.ScannerForText(yaml);

    // The expected scalar is the surrogate pair U+D83D U+DC4D (thumbs-up emoji). It is spelled
    // with \u escapes instead of the literal glyph because, per the original author's note,
    // Windows Terminal does not display the glyph reliably.
    AssertSequenceOfTokensFrom(
        scanner,
        StreamStart,
        BlockMappingStart,
        Key,
        PlainScalar("Test"),
        Value,
        DoubleQuotedScalar("\uD83D\uDC4D"),
        BlockEnd,
        StreamEnd);
}

private void AssertPartialSequenceOfTokensFrom(Scanner scanner, params Token[] tokens)
{
var tokenNumber = 1;
Expand Down
63 changes: 56 additions & 7 deletions YamlDotNet/Core/Scanner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1937,19 +1937,68 @@ private Scalar ScanFlowScalar(bool isSingleQuoted)

// Check the value and write the character.

if ((character >= 0xD800 && character <= 0xDFFF) || character > 0x10FFFF)
// Check for a UTF-16 surrogate pair written as two consecutive escapes (e.g. "\uD83D\uDC4D").
// A surrogate code unit is not a character on its own, so the escape that follows must
// supply the second half of the pair.
if (character >= 0xD800 && character <= 0xDFFF)
{
    // Advance past the hex digits of the first escape.
    for (var k = 0; k < codeLength; ++k)
    {
        Skip();
    }

    if (analyzer.Peek(0) == '\\' &&
        (analyzer.Peek(1) == 'u' || analyzer.Peek(1) == 'U'))
    {
        Skip(); // escape character

        // 'u' is followed by 4 hex digits, 'U' by 8.
        if (analyzer.Peek(0) == 'u')
        {
            codeLength = 4;
        }
        else
        {
            codeLength = 8;
        }
        Skip(); // escape code

        var lowSurrogate = 0;

        // Scan the character value.
        for (var k = 0; k < codeLength; ++k)
        {
            // Validate the digit that is about to be consumed (offset k), not just the first one.
            if (!analyzer.IsHex(k))
            {
                throw new SyntaxErrorException(start, cursor.Mark(), "While scanning a quoted scalar, did not find expected hexadecimal number.");
            }
            lowSurrogate = ((lowSurrogate << 4) + analyzer.AsHex(k));
        }

        // Advance past the hex digits of the second escape.
        for (var k = 0; k < codeLength; ++k)
        {
            Skip();
        }

        // The first escape must be a high surrogate (D800-DBFF) and the second a low surrogate
        // (DC00-DFFF). The check is done on the int values before casting so that an 8-digit
        // escape cannot be truncated into the valid range, and so that a malformed pair is
        // reported as a syntax error instead of an ArgumentOutOfRangeException from
        // char.ConvertToUtf32.
        if (character > 0xDBFF || lowSurrogate < 0xDC00 || lowSurrogate > 0xDFFF)
        {
            throw new SyntaxErrorException(start, cursor.Mark(), "While scanning a quoted scalar, found invalid Unicode surrogates.");
        }

        character = char.ConvertToUtf32((char)character, (char)lowSurrogate);
    }
    else
    {
        throw new SyntaxErrorException(start, cursor.Mark(), "While scanning a quoted scalar, found invalid Unicode surrogates.");
    }
}
else if (character > 0x10FFFF)
{
throw new SyntaxErrorException(start, cursor.Mark(), "While scanning a quoted scalar, found invalid Unicode character escape code.");
}
else
{
// Advance the pointer.

value.Append(char.ConvertFromUtf32(character));

// Advance the pointer.
for (var k = 0; k < codeLength; ++k)
{
Skip();
}

for (var k = 0; k < codeLength; ++k)
{
Skip();
}

value.Append(char.ConvertFromUtf32(character));
}
}
else
Expand Down

0 comments on commit e8d7113

Please sign in to comment.