[fix] Make UTF-8 validation work even if utf-8-validate is not installed

Fixes #1868
websockets · Apr 17, 2021 · 23ba6b2 · 23ba6b2
1 parent 114de9e
commit 23ba6b2
Show file tree

Hide file tree

Showing 3 changed files with 141 additions and 15 deletions.
diff --git a/README.md b/README.md
@@ -56,7 +56,7 @@ can use one of the many wrappers available on npm, like
 npm install ws
 ```
 
-### Opt-in for performance and spec compliance
+### Opt-in for performance
 
 There are 2 optional modules that can be installed alongside the ws
 module. These modules are binary addons which improve certain operations.
@@ -67,7 +67,7 @@ necessarily need to have a C++ compiler installed on your machine.
  operations such as masking and unmasking the data payload of the WebSocket
  frames.
 - `npm install --save-optional utf-8-validate`: Allows efficiently checking whether a
- message contains valid UTF-8 as required by the spec.
+ message contains valid UTF-8.
 
 ## API docs
 

diff --git a/lib/validation.js b/lib/validation.js
@@ -1,24 +1,13 @@
 'use strict';
 
-try {
- const isValidUTF8 = require('utf-8-validate');
-
- exports.isValidUTF8 =
- typeof isValidUTF8 === 'object'
- ? isValidUTF8.Validation.isValidUTF8 // utf-8-validate@<3.0.0
- : isValidUTF8;
-} catch (e) /* istanbul ignore next */ {
- exports.isValidUTF8 = () => true;
-}
-
 /**
  * Checks if a status code is allowed in a close frame.
  *
  * @param {Number} code The status code
  * @return {Boolean} `true` if the status code is valid, else `false`
  * @public
  */
-exports.isValidStatusCode = (code) => {
+function isValidStatusCode(code) {
  return (
  (code >= 1000 &&
  code <= 1014 &&
@@ -27,4 +16,89 @@ exports.isValidStatusCode = (code) => {
  code !== 1006) ||
  (code >= 3000 && code <= 4999)
  );
-};
+}
+
+/**
+ * Checks if a given buffer contains only correct UTF-8.
+ * Ported from https://www.cl.cam.ac.uk/%7Emgk25/ucs/utf8_check.c by
+ * Markus Kuhn.
+ *
+ * The buffer is scanned one sequence at a time: each lead byte is classified
+ * by its high bits as the start of a 1, 2, 3 or 4 byte sequence. The scan
+ * stops with `false` as soon as a sequence is truncated, contains a byte that
+ * is not a continuation byte (`10xxxxxx`), is an overlong encoding, encodes a
+ * surrogate (U+D800 - U+DFFF) or a code point above U+10FFFF, or starts with
+ * a byte that cannot begin a sequence. An empty buffer is reported as valid.
+ *
+ * @param {Buffer} buf The buffer to check
+ * @return {Boolean} `true` if `buf` contains only correct UTF-8, else `false`
+ * @public
+ */
+function _isValidUTF8(buf) {
+ const len = buf.length;
+ let i = 0; // Index of the lead byte of the sequence being checked
+
+ while (i < len) {
+ if (buf[i] < 0x80) {
+ // 0xxxxxxx
+ i++;
+ } else if ((buf[i] & 0xe0) === 0xc0) {
+ // 110xxxxx 10xxxxxx
+ if (
+ i + 1 === len || // Truncated sequence
+ (buf[i + 1] & 0xc0) !== 0x80 || // Not a continuation byte
+ (buf[i] & 0xfe) === 0xc0 // Overlong
+ ) {
+ return false;
+ } else {
+ i += 2;
+ }
+ } else if ((buf[i] & 0xf0) === 0xe0) {
+ // 1110xxxx 10xxxxxx 10xxxxxx
+ if (
+ i + 2 >= len || // Truncated sequence
+ (buf[i + 1] & 0xc0) !== 0x80 || // Not a continuation byte
+ (buf[i + 2] & 0xc0) !== 0x80 || // Not a continuation byte
+ (buf[i] === 0xe0 && (buf[i + 1] & 0xe0) === 0x80) || // Overlong
+ (buf[i] === 0xed && (buf[i + 1] & 0xe0) === 0xa0) // Surrogate (U+D800 - U+DFFF)
+ ) {
+ return false;
+ } else {
+ i += 3;
+ }
+ } else if ((buf[i] & 0xf8) === 0xf0) {
+ // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ if (
+ i + 3 >= len || // Truncated sequence
+ (buf[i + 1] & 0xc0) !== 0x80 || // Not a continuation byte
+ (buf[i + 2] & 0xc0) !== 0x80 || // Not a continuation byte
+ (buf[i + 3] & 0xc0) !== 0x80 || // Not a continuation byte
+ (buf[i] === 0xf0 && (buf[i + 1] & 0xf0) === 0x80) || // Overlong
+ (buf[i] === 0xf4 && buf[i + 1] > 0x8f) || // > U+10FFFF
+ buf[i] > 0xf4 // > U+10FFFF
+ ) {
+ return false;
+ } else {
+ i += 4;
+ }
+ } else {
+ return false; // Continuation byte without a lead byte, or 0xf8 - 0xff
+ }
+ }
+
+ return true; // Every sequence passed (also reached for an empty buffer)
+}
+
+try { // Prefer the optional `utf-8-validate` module when it can be loaded
+ let isValidUTF8 = require('utf-8-validate');
+
+ /* istanbul ignore if */
+ if (typeof isValidUTF8 === 'object') { // Export is an object rather than the function itself
+ isValidUTF8 = isValidUTF8.Validation.isValidUTF8; // utf-8-validate@<3.0.0
+ }
+
+ module.exports = {
+ isValidStatusCode,
+ isValidUTF8(buf) { // The call below resolves to the `let` binding above, not to this method
+ return buf.length < 150 ? _isValidUTF8(buf) : isValidUTF8(buf); // NOTE(review): JS path for buffers under 150 bytes, presumably cheaper than calling the addon for small inputs - confirm
+ }
+ };
+} catch (e) /* istanbul ignore next */ { // Anything thrown above (e.g. module not installed) lands here
+ module.exports = {
+ isValidStatusCode,
+ isValidUTF8: _isValidUTF8 // Fall back to the JavaScript implementation for every buffer size
+ };
+}
diff --git a/test/validation.test.js b/test/validation.test.js
@@ -0,0 +1,52 @@
+'use strict';
+
+const assert = require('assert');
+
+const { isValidUTF8 } = require('../lib/validation');
+
+describe('extension', () => {
+ describe('isValidUTF8', () => {
+ it('returns false if it finds invalid bytes', () => {
+ assert.strictEqual(isValidUTF8(Buffer.from([0xf8])), false);
+ });
+
+ it('returns false for overlong encodings', () => {
+ assert.strictEqual(isValidUTF8(Buffer.from([0xc0, 0xa0])), false);
+ assert.strictEqual(isValidUTF8(Buffer.from([0xe0, 0x80, 0xa0])), false);
+ assert.strictEqual(
+ isValidUTF8(Buffer.from([0xf0, 0x80, 0x80, 0xa0])),
+ false
+ );
+ });
+
+ it('returns false for code points in the range U+D800 - U+DFFF', () => {
+ for (let i = 0xa0; i < 0xc0; i++) {
+ for (let j = 0x80; j < 0xc0; j++) {
+ assert.strictEqual(isValidUTF8(Buffer.from([0xed, i, j])), false);
+ }
+ }
+ });
+
+ it('returns false for code points greater than U+10FFFF', () => {
+ assert.strictEqual(
+ isValidUTF8(Buffer.from([0xf4, 0x90, 0x80, 0x80])),
+ false
+ );
+ assert.strictEqual(
+ isValidUTF8(Buffer.from([0xf5, 0x80, 0x80, 0x80])),
+ false
+ );
+ });
+
+ it('returns true for a well-formed UTF-8 byte sequence', () => {
+ // prettier-ignore
+ const buf = Buffer.from([
+ 0xe2, 0x82, 0xAC, // €
+ 0xf0, 0x90, 0x8c, 0x88, // 𐍈
+ 0x24 // $
+ ]);
+
+ assert.strictEqual(isValidUTF8(buf), true);
+ });
+ });
+});