/* utf8_mosq.c */
/*
Copyright (c) 2016-2020 Roger Light <[email protected]>
All rights reserved. This program and the accompanying materials
are made available under the terms of the Eclipse Public License 2.0
and Eclipse Distribution License v1.0 which accompany this distribution.
The Eclipse Public License is available at
https://www.eclipse.org/legal/epl-2.0/
and the Eclipse Distribution License is available at
https://www.eclipse.org/org/documents/edl-v10.php.
SPDX-License-Identifier: EPL-2.0 OR BSD-3-Clause
Contributors:
Roger Light - initial implementation.
*/
#include "config.h"
#include <stdio.h>
#include "mosquitto.h"
/*
 * Check that the first len bytes of str are well-formed UTF-8 made up only of
 * code points this function accepts.
 *
 * Parameters:
 *   str - bytes to check. Only str[0] .. str[len-1] are read, so the buffer
 *         does not need to be NUL terminated.
 *   len - number of bytes of str to check.
 *
 * Returns:
 *   MOSQ_ERR_SUCCESS        - all len bytes were consumed as accepted code
 *                             points (this includes len == 0).
 *   MOSQ_ERR_INVAL          - str is NULL, or len is negative or greater
 *                             than 65536.
 *   MOSQ_ERR_MALFORMED_UTF8 - the input contains a zero byte, an invalid lead
 *                             byte, a missing or truncated continuation, an
 *                             overlong encoding, a UTF-16 surrogate, a value
 *                             above U+10FFFF, a non-character, or a control
 *                             character (up to U+001F, or U+007F to U+009F).
 */
int mosquitto_validate_utf8(const char *str, int len)
{
	int i;
	int j;
	int codelen;
	int codepoint;
	const unsigned char *ustr = (const unsigned char *)str;

	if(!str) return MOSQ_ERR_INVAL;
	/* NOTE(review): MQTT string lengths are carried in 16 bits, so 65535 may
	 * be the intended upper bound - confirm before tightening this check. */
	if(len < 0 || len > 65536) return MOSQ_ERR_INVAL;

	for(i=0; i<len; i++){
		/* Classify the lead byte: it fixes the sequence length and supplies
		 * the most significant bits of the code point. */
		if(ustr[i] == 0){
			return MOSQ_ERR_MALFORMED_UTF8;
		}else if(ustr[i] <= 0x7f){
			codelen = 1;
			codepoint = ustr[i];
		}else if((ustr[i] & 0xE0) == 0xC0){
			/* 110xxxxx - 2 byte sequence */
			if(ustr[i] == 0xC0 || ustr[i] == 0xC1){
				/* These lead bytes can only form overlong encodings of
				 * values below 0x80. */
				return MOSQ_ERR_MALFORMED_UTF8;
			}
			codelen = 2;
			codepoint = (ustr[i] & 0x1F);
		}else if((ustr[i] & 0xF0) == 0xE0){
			/* 1110xxxx - 3 byte sequence */
			codelen = 3;
			codepoint = (ustr[i] & 0x0F);
		}else if((ustr[i] & 0xF8) == 0xF0){
			/* 11110xxx - 4 byte sequence */
			if(ustr[i] > 0xF4){
				/* Invalid, this would produce values > 0x10FFFF. */
				return MOSQ_ERR_MALFORMED_UTF8;
			}
			codelen = 4;
			codepoint = (ustr[i] & 0x07);
		}else{
			/* Either a stray continuation byte (10xxxxxx) or a byte in the
			 * range 0xF8-0xFF, neither of which can start a sequence. */
			return MOSQ_ERR_MALFORMED_UTF8;
		}

		/* The whole sequence, bytes i .. i+codelen-1, must lie inside the
		 * first len bytes. Checking the remaining byte count here (rather
		 * than one exact index) guarantees the loop below never reads at or
		 * beyond ustr[len], however short the input is truncated. */
		if(codelen > len - i){
			/* Not enough data */
			return MOSQ_ERR_MALFORMED_UTF8;
		}

		/* Reconstruct full code point: each continuation byte must look like
		 * 10xxxxxx and contributes its low six bits. */
		for(j=0; j<codelen-1; j++){
			if((ustr[++i] & 0xC0) != 0x80){
				/* Not a continuation byte */
				return MOSQ_ERR_MALFORMED_UTF8;
			}
			codepoint = (codepoint<<6) | (ustr[i] & 0x3F);
		}

		/* Check for UTF-16 high/low surrogates */
		if(codepoint >= 0xD800 && codepoint <= 0xDFFF){
			return MOSQ_ERR_MALFORMED_UTF8;
		}

		/* Check for overlong or out of range encodings. The 2 byte case needs
		 * no test here: rejecting the 0xC0 and 0xC1 lead bytes above already
		 * rules out every 2 byte encoding of a value below 0x80. */
		if(codelen == 3 && codepoint < 0x0800){
			return MOSQ_ERR_MALFORMED_UTF8;
		}else if(codelen == 4 && (codepoint < 0x10000 || codepoint > 0x10FFFF)){
			return MOSQ_ERR_MALFORMED_UTF8;
		}

		/* Check for non-characters: U+FDD0..U+FDEF and the last two code
		 * points (xxFFFE, xxFFFF) of every plane. */
		if(codepoint >= 0xFDD0 && codepoint <= 0xFDEF){
			return MOSQ_ERR_MALFORMED_UTF8;
		}
		if((codepoint & 0xFFFF) == 0xFFFE || (codepoint & 0xFFFF) == 0xFFFF){
			return MOSQ_ERR_MALFORMED_UTF8;
		}

		/* Check for control characters */
		if(codepoint <= 0x001F || (codepoint >= 0x007F && codepoint <= 0x009F)){
			return MOSQ_ERR_MALFORMED_UTF8;
		}
	}
	return MOSQ_ERR_SUCCESS;
}