Skip to content

Commit

Permalink
Implement node.js buffer.isAscii|isUtf8
Browse files Browse the repository at this point in the history
This change adds two new methods to the node.js Buffer class:
isAscii and isUtf8. These methods are used to check if the buffer
contains only ASCII or UTF-8 characters, respectively.

The implementation of these methods is based on the simdutf library,
just like in Node.js itself.
  • Loading branch information
jasnell committed Jun 18, 2024
1 parent 2dca6bb commit c2648e0
Show file tree
Hide file tree
Showing 10 changed files with 154 additions and 0 deletions.
10 changes: 10 additions & 0 deletions WORKSPACE
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,16 @@ http_archive(
url = "https://github.com/ada-url/ada/releases/download/v2.8.0/singleheader.zip",
)

# Fetches the simdutf v5.2.8 "singleheader" release archive (a zip) and exposes
# it as the external repository @simdutf, built with //:build/BUILD.simdutf.
# The sha256 pins the exact archive contents; it must be updated together with
# the version in the URL.
http_archive(
name = "simdutf",
build_file = "//:build/BUILD.simdutf",
# No patches are applied at present; patch_args only takes effect once
# entries are added to `patches`.
patch_args = ["-p1"],
patches = [],
sha256 = "7867c118a11bb7ccaea0f999a28684b06040027506b424b706146cc912b80ff6",
type = "zip",
url = "https://github.com/simdutf/simdutf/releases/download/v5.2.8/singleheader.zip",
)

http_archive(
name = "pyodide",
build_file = "//:build/BUILD.pyodide",
Expand Down
7 changes: 7 additions & 0 deletions build/BUILD.simdutf
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Builds the single-header distribution of simdutf (one .cpp plus one .h) as a
# publicly visible C++ library target.
cc_library(
name = "simdutf",
srcs = ["simdutf.cpp"],
hdrs = ["simdutf.h"],
visibility = ["//visibility:public"],
# -w suppresses all compiler warnings for this third-party source.
copts = ["-w"],
)
2 changes: 2 additions & 0 deletions compile_flags.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
-nostdinc
-Ibazel-bin/external/dawn/include
-Ibazel-bin/external/ada-url/_virtual_includes/ada-url/
-Ibazel-bin/external/simdutf/_virtual_includes/simdutf/
-Ibazel-bin/external/com_cloudflare_lol_html/_virtual_includes/lolhtml
-Iexternal/perfetto-sdk/sdk/
-Iexternal/ada-url/
-Iexternal/simdutf/
-Iexternal/com_google_benchmark/include/
-Iexternal/dawn/include
-Iexternal/ssl/src/include
Expand Down
6 changes: 6 additions & 0 deletions src/node/buffer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ import {
kStringMaxLength,
Buffer,
SlowBuffer,
isAscii,
isUtf8,
} from 'node-internal:internal_buffer';

// eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
Expand All @@ -26,6 +28,8 @@ export {
Blob,
Buffer,
SlowBuffer,
isAscii,
isUtf8,
};

export default {
Expand All @@ -40,4 +44,6 @@ export default {
Blob,
Buffer,
SlowBuffer,
isAscii,
isUtf8,
};
2 changes: 2 additions & 0 deletions src/node/internal/buffer.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,5 @@ export function write(buffer: Uint8Array,
encoding: string): void;
export function decode(buffer: Uint8Array, state: Uint8Array): string;
export function flush(state: Uint8Array): string;
/**
 * Checks whether the given view contains only ASCII data.
 *
 * @param value The view whose bytes are inspected.
 * @returns `true` when the content is ASCII-only, `false` otherwise.
 */
export function isAscii(value: ArrayBufferView): boolean;
/**
 * Checks whether the given view contains only valid UTF-8 data.
 *
 * @param value The view whose bytes are inspected.
 * @returns `true` when the content is valid UTF-8, `false` otherwise.
 */
export function isUtf8(value: ArrayBufferView): boolean;
16 changes: 16 additions & 0 deletions src/node/internal/internal_buffer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2280,10 +2280,26 @@ function writeU_Int24LE(
return offset;
}

/**
 * Determines whether `value` holds only ASCII data by delegating to the
 * native `bufferUtil.isAscii` helper.
 *
 * @param value The view to inspect.
 * @returns The result reported by `bufferUtil.isAscii`.
 * @throws {Error} If `value`, or the buffer backing it, reports `detached`.
 */
export function isAscii(value: ArrayBufferView) {
  // `detached` is not part of the ArrayBufferView type, so the probe goes
  // through an untyped alias. The argument itself is checked first and its
  // backing buffer is only consulted when that check is falsy.
  const target = value as any;
  const isDetached = target?.detached ? true : Boolean(target?.buffer?.detached);
  if (isDetached) {
    throw new Error('Unable to determine if buffer is ASCII when it is detached');
  }
  return bufferUtil.isAscii(value);
}

/**
 * Determines whether `value` holds only valid UTF-8 data by delegating to the
 * native `bufferUtil.isUtf8` helper.
 *
 * @param value The view to inspect.
 * @returns The result reported by `bufferUtil.isUtf8`.
 * @throws {Error} If `value`, or the buffer backing it, reports `detached`.
 */
export function isUtf8(value: ArrayBufferView) {
  // `detached` is not part of the ArrayBufferView type, so the probe goes
  // through an untyped alias. The argument itself is checked first and its
  // backing buffer is only consulted when that check is falsy.
  const target = value as any;
  const isDetached = target?.detached ? true : Boolean(target?.buffer?.detached);
  if (isDetached) {
    throw new Error('Unable to determine if buffer is UTF8 when it is detached');
  }
  return bufferUtil.isUtf8(value);
}

// Default export: bundles the module's bindings into a single object,
// including the isAscii / isUtf8 validation helpers.
export default {
Buffer,
constants,
kMaxLength,
kStringMaxLength,
SlowBuffer,
isAscii,
isUtf8,
};
95 changes: 95 additions & 0 deletions src/workerd/api/node/buffer-nodejs-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ import {
kMaxLength,
kStringMaxLength,
constants,
isAscii,
isUtf8,
} from 'node:buffer';

import * as buffer from 'node:buffer';
Expand Down Expand Up @@ -5610,3 +5612,96 @@ export const inspect = {
);
}
};

// Covers isAscii(): ASCII text, non-ASCII text, an empty Buffer, and a set of
// non-buffer arguments that must each be rejected with an exception.
export const isAsciiTest = {
  test(ctrl, env, ctx) {
    const utf8Encoder = new TextEncoder();

    strictEqual(isAscii(utf8Encoder.encode('hello')), true);
    strictEqual(isAscii(utf8Encoder.encode('ğ')), false);
    strictEqual(isAscii(Buffer.from([])), true);

    // None of these values are buffers or views, so every call must throw.
    const rejectedInputs = [
      undefined,
      '', 'hello',
      false, true,
      0, 1,
      0n, 1n,
      Symbol(),
      () => {},
      {}, [], null,
    ];
    for (const candidate of rejectedInputs) {
      throws(() => isAscii(candidate));
    }
  }
};

// Covers isUtf8(): well-formed input, an empty Buffer, a table of malformed
// byte sequences that must be reported as invalid, and non-buffer arguments
// that must be rejected with an exception.
export const isUtf8Test = {
  test(ctrl, env, ctx) {
    const utf8Encoder = new TextEncoder();

    strictEqual(isUtf8(utf8Encoder.encode('hello')), true);
    strictEqual(isUtf8(utf8Encoder.encode('ğ')), true);
    strictEqual(isUtf8(Buffer.from([])), true);

    // Taken from test/fixtures/wpt/encoding/textdecoder-fatal.any.js
    const malformedSequences = [
      [0xFF], // 'invalid code'
      [0xC0], // 'ends early'
      [0xE0], // 'ends early 2'
      [0xC0, 0x00], // 'invalid trail'
      [0xC0, 0xC0], // 'invalid trail 2'
      [0xE0, 0x00], // 'invalid trail 3'
      [0xE0, 0xC0], // 'invalid trail 4'
      [0xE0, 0x80, 0x00], // 'invalid trail 5'
      [0xE0, 0x80, 0xC0], // 'invalid trail 6'
      [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // '> 0x10FFFF'
      [0xFE, 0x80, 0x80, 0x80, 0x80, 0x80], // 'obsolete lead byte'

      // Overlong encodings
      [0xC0, 0x80], // 'overlong U+0000 - 2 bytes'
      [0xE0, 0x80, 0x80], // 'overlong U+0000 - 3 bytes'
      [0xF0, 0x80, 0x80, 0x80], // 'overlong U+0000 - 4 bytes'
      [0xF8, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 5 bytes'
      [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 6 bytes'

      [0xC1, 0xBF], // 'overlong U+007F - 2 bytes'
      [0xE0, 0x81, 0xBF], // 'overlong U+007F - 3 bytes'
      [0xF0, 0x80, 0x81, 0xBF], // 'overlong U+007F - 4 bytes'
      [0xF8, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 5 bytes'
      [0xFC, 0x80, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 6 bytes'

      [0xE0, 0x9F, 0xBF], // 'overlong U+07FF - 3 bytes'
      [0xF0, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 4 bytes'
      [0xF8, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 5 bytes'
      [0xFC, 0x80, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 6 bytes'

      [0xF0, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 4 bytes'
      [0xF8, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 5 bytes'
      [0xFC, 0x80, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 6 bytes'

      [0xF8, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 5 bytes'
      [0xFC, 0x80, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 6 bytes'

      // UTF-16 surrogates encoded as code points in UTF-8
      [0xED, 0xA0, 0x80], // 'lead surrogate'
      [0xED, 0xB0, 0x80], // 'trail surrogate'
      [0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80], // 'surrogate pair'
    ];
    for (const bytes of malformedSequences) {
      strictEqual(isUtf8(Buffer.from(bytes)), false);
    }

    // None of these values are buffers or views, so every call must throw.
    const rejectedInputs = [
      null,
      undefined,
      'hello',
      true,
      false,
    ];
    for (const candidate of rejectedInputs) {
      throws(() => isUtf8(candidate));
    }
  }
};
11 changes: 11 additions & 0 deletions src/workerd/api/node/buffer.c++
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <workerd/jsg/buffersource.h>
#include <kj/encoding.h>
#include <algorithm>
#include "simdutf.h"

// These are defined by <sys/byteorder.h> or <netinet/in.h> on some systems.
// To avoid warnings, undefine them before redefining them.
Expand Down Expand Up @@ -862,5 +863,15 @@ jsg::JsString BufferUtil::flush(jsg::Lock& js, kj::Array<kj::byte> state) {
return ret;
}

// Returns true when `buffer` is empty or when simdutf reports that every byte
// in it is ASCII.
bool BufferUtil::isAscii(kj::Array<kj::byte> buffer) {
  const auto length = buffer.size();
  if (length == 0) {
    // Nothing to validate.
    return true;
  }
  const auto chars = buffer.asChars();
  return simdutf::validate_ascii(chars.begin(), length);
}

// Returns true when `buffer` is empty or when simdutf reports that its
// contents are valid UTF-8.
bool BufferUtil::isUtf8(kj::Array<kj::byte> buffer) {
  const auto length = buffer.size();
  if (length == 0) {
    // Nothing to validate.
    return true;
  }
  const auto chars = buffer.asChars();
  return simdutf::validate_utf8(chars.begin(), length);
}

} // namespace workerd::api::node {

4 changes: 4 additions & 0 deletions src/workerd/api/node/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ class BufferUtil final: public jsg::Object {
kj::Array<kj::byte> bytes,
kj::Array<kj::byte> state);
jsg::JsString flush(jsg::Lock& js, kj::Array<kj::byte> state);
bool isAscii(kj::Array<kj::byte> bytes);
bool isUtf8(kj::Array<kj::byte> bytes);

JSG_RESOURCE_TYPE(BufferUtil) {
JSG_METHOD(byteLength);
Expand All @@ -88,6 +90,8 @@ class BufferUtil final: public jsg::Object {
JSG_METHOD(swap);
JSG_METHOD(toString);
JSG_METHOD(write);
JSG_METHOD(isAscii);
JSG_METHOD(isUtf8);

// For StringDecoder
JSG_METHOD(decode);
Expand Down
1 change: 1 addition & 0 deletions src/workerd/io/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ wd_cc_library(
implementation_deps = [
"@capnp-cpp//src/kj/compat:kj-brotli",
"@capnp-cpp//src/kj/compat:kj-gzip",
"@simdutf",
],
visibility = ["//visibility:public"],
deps = [
Expand Down

0 comments on commit c2648e0

Please sign in to comment.