-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean.py
225 lines (187 loc) · 8.89 KB
/
clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
import re
import codecs
# Clean Gadget
# Author https://github.com/johnb110/VDPython:
# For each gadget, replaces all user variables with "VAR#" and user functions with "FUN#"
# Content of string and character literals is removed by tokenizer() before gadgets are cleaned.
# `keywords` (defined below) holds the C/C++ keywords up to C11 and C++17; immutable set
from typing import List
# Identifiers that must never be renamed: C/C++ keywords up to C11 and C++17,
# their underscore-prefixed spellings (e.g. __asm / _asm), and NULL.
# Built from a whitespace-separated word list; immutable set.
keywords = frozenset("""
    __asm __builtin __cdecl __declspec __except __export __far16 __far32
    __fastcall __finally __import __inline __int16 __int32 __int64 __int8
    __leave __optlink __packed __pascal __stdcall __system __thread __try
    __unaligned _asm _Builtin _Cdecl _declspec _except _Export _Far16
    _Far32 _Fastcall _finally _Import _inline _int16 _int32 _int64
    _int8 _leave _Optlink _Packed _Pascal _stdcall _System _try alignas
    alignof and and_eq asm auto bitand bitor bool break case
    catch char char16_t char32_t class compl const const_cast constexpr
    continue decltype default delete do double dynamic_cast else enum
    explicit export extern false final float for friend goto if
    inline int long mutable namespace new noexcept not not_eq nullptr
    operator or or_eq override private protected public register
    reinterpret_cast return short signed sizeof static static_assert
    static_cast struct switch template this thread_local throw true try
    typedef typeid typename union unsigned using virtual void volatile
    wchar_t while xor xor_eq NULL
""".split())

# Function names that are known not to be user-defined and therefore keep
# their name; immutable set.
main_set = frozenset(('main',))

# Parameter names of main() that keep their name; immutable set.
main_args = frozenset(('argc', 'argv'))
# Operator tables grouped by length. The split pattern lists the longer
# operators first so that e.g. "<<=" is tried before "<<" and "<".
operators3 = {'<<=', '>>='}
operators2 = {
    '->', '++', '--', '**',
    '!~', '<<', '>>', '<=', '>=',
    '==', '!=', '&&', '||', '+=',
    '-=', '*=', '/=', '%=', '&=', '^=', '|='
}
operators1 = {
    '(', ')', '[', ']', '.',
    '+', '&',
    '%', '<', '>', '^', '|',
    '=', ',', '?', ':',
    '{', '}', '!', '~'
}


def to_regex(lst):
    """Return a regex alternation with one capturing group per element of *lst*.

    Every element is escaped with re.escape and wrapped in its own group, so
    that re.split() keeps the matched operator in its result list.
    """
    return r'|'.join(f"({re.escape(el)})" for el in lst)


# The per-length alternations must themselves be joined with '|'. Plain string
# concatenation fuses the last group of one table with the first group of the
# next (e.g. "(>>=)(->)"), which silently disables both operators; because set
# iteration order is not fixed, the affected operators would differ per run.
regex_split_operators = r'|'.join(
    to_regex(ops) for ops in (operators3, operators2, operators1)
)
# input is a list of string lines
# Identifier directly followed by "(": function name candidate.
_RX_FUN_CANDIDATE = re.compile(r'\b([_A-Za-z]\w*)\b(?=\s*\()')
# Identifier followed neither by another word (optionally behind "*") nor by
# "(": variable name candidate. The second group is always empty; it is kept
# so that findall() still yields (name, '') tuples.
_RX_VAR_CANDIDATE = re.compile(r'\b([_A-Za-z]\w*)\b((?!\s*\**\w+))(?!\s*\()')
_RX_NON_ASCII = re.compile(r'[^\x00-\x7f]')
_RX_HEX_LITERAL = re.compile(r'0[xX][0-9a-fA-F]+')


def clean_gadget(gadget):
    """Replace user-defined identifiers in a gadget with numbered symbols.

    Every line is stripped of non-ASCII characters and has its hexadecimal
    literals replaced by the placeholder "HEX". Function name candidates are
    then rewritten to "FUN<n>" and variable name candidates to "VAR<n>".
    Numbers are handed out in order of first appearance and are shared by all
    lines of the gadget. Names found in ``keywords`` are never renamed;
    ``main_set`` additionally protects function names and ``main_args``
    protects variable names.

    Args:
        gadget: iterable of source lines (strings).

    Returns:
        A new list with one cleaned string per input line.
    """
    # map original function / variable name -> assigned symbol
    fun_symbols = {}
    var_symbols = {}
    cleaned_gadget = []

    for line in gadget:
        ascii_line = _RX_NON_ASCII.sub('', line)
        # NOTE(review): "HEX" is itself an identifier, so the variable pass
        # below can rename it to a VAR symbol like any other name - confirm
        # that this is intended.
        hex_line = _RX_HEX_LITERAL.sub('HEX', ascii_line)

        # Candidates are collected before any substitution so that freshly
        # inserted FUN/VAR symbols are never treated as user identifiers.
        user_fun = _RX_FUN_CANDIDATE.findall(hex_line)
        user_var = _RX_VAR_CANDIDATE.findall(hex_line)

        for fun_name in user_fun:
            if fun_name in main_set or fun_name in keywords:
                continue
            if fun_name not in fun_symbols:
                fun_symbols[fun_name] = 'FUN' + str(len(fun_symbols) + 1)
            # the positive lookahead makes sure only the call / definition is
            # replaced, not a variable sharing the same identifier
            hex_line = re.sub(r'\b(' + fun_name + r')\b(?=\s*\()',
                              fun_symbols[fun_name], hex_line)

        for var_name, _ in user_var:
            if var_name in keywords or var_name in main_args:
                continue
            if var_name not in var_symbols:
                var_symbols[var_name] = 'VAR' + str(len(var_symbols) + 1)
            # the negative lookahead makes sure only the variable is replaced,
            # not a function sharing the same identifier
            hex_line = re.sub(r'\b(' + var_name + r')\b(?:(?=\s*\w+\()|(?!\s*\w+))(?!\s*\()',
                              var_symbols[var_name], hex_line)

        cleaned_gadget.append(hex_line)

    return cleaned_gadget
# Cleaner & Tokenizer
# Author https://github.com/hazimhanif/svd-transformer/blob/master/transformer_svd.ipynb
# Block comments (possibly spanning several lines) and line comments.
_RX_COMMENT = re.compile(r'/\*[\s\S]*?\*/|//[^\r\n]*')


def tokenizer(code, flag=False):
    """Tokenize C/C++ source text into a flat list of normalized tokens.

    Steps, in order:
      1. string and character literals are removed;
      2. if *flag* is true the text is decoded with the "unicode_escape"
         codec (turning escaped sequences such as a literal backslash-n into
         real characters);
      3. comments are removed from the whole text, so block comments that
         span several lines disappear and comment words cannot take part in
         identifier renaming;
      4. the remaining non-empty lines are passed to clean_gadget();
      5. each cleaned line is split on whitespace, operators and punctuation.

    Args:
        code: the source text to tokenize.
        flag: decode escape sequences in *code* before splitting it into lines.

    Returns:
        A list of token strings; operators and punctuation are separate
        tokens and whitespace-only tokens are dropped.
    """
    # Strings go first so that "//" or "/*" inside a string literal is not
    # mistaken for a comment later on.
    code = re.sub(r'"(?:[^"\\\n]|\\.|\\\n)*"', '', code)
    # Escape-aware, so that '\'' is consumed as one literal instead of
    # leaving a dangling quote behind.
    code = re.sub(r"'(?:[^'\\\n]|\\.)*'", '', code)
    if flag:
        code = codecs.getdecoder("unicode_escape")(code)[0]
    # A comment acts as a separator, hence the single space.
    code = _RX_COMMENT.sub(' ', code)

    gadget = [stripped for stripped in (line.strip() for line in code.splitlines()) if stripped]

    # Mixed split (characters and words). Any whitespace separates tokens; a
    # tab between two words must not glue them together.
    splitter = re.compile(r'\s+|' + regex_split_operators + r'|(\/)|(\;)|(\-)|(\*)')

    tokenized = []
    for cg in clean_gadget(gadget):
        if cg == '':
            continue
        # Drop literal backslash-n sequences and stray backslashes
        cg = re.sub(r'\\n|\\', '', cg)
        # re.split yields None for groups that did not take part in the match
        tokenized.extend(tok for tok in splitter.split(cg) if tok and tok.strip())
    return tokenized
if __name__ == '__main__':
    # Manual smoke run: print the raw operator split of one expression, the
    # tokenizer output for two expressions, and then a complete test case.
    test = "((uint32_t *)(&s->boncop))[addr/sizeof(uint32_t)]"
    test2 = "(type_code[hw_breakpoint[n].type] << (16 + n*4)) |\n\n ((uint32_t)len_code[hw_breakpoint[n].len] << (18 + n*4))"
    raw_parts = re.split(regex_split_operators + r'|(\/)', test)
    print([part for part in raw_parts if part])
    print(tokenizer(test))
    print(tokenizer(test2))
    test3 = """#include "std_testcase.h"
#ifndef _WIN32
#include <wchar.h>
#endif
#define SRC_STR "0123456789abcdef0123456789abcde"
typedef struct _charVoid
{
char charFirst[16];
void * voidSecond;
void * voidThird;
} charVoid;
static void good1()
{
{
charVoid * structCharVoid = (charVoid *)malloc(sizeof(charVoid));
if (structCharVoid == NULL) {exit(-1);}
structCharVoid->voidSecond = (void *)SRC_STR;
/* Print the initial block pointed to by structCharVoid->voidSecond */
printLine((char *)structCharVoid->voidSecond);
/* FIX: Use the sizeof(structCharVoid->charFirst) to avoid overwriting the pointer y */
memcpy(structCharVoid->charFirst, SRC_STR, sizeof(structCharVoid->charFirst));
structCharVoid->charFirst[(sizeof(structCharVoid->charFirst)/sizeof(char))-1] = '\0'; /* null terminate the string */
printLine((char *)structCharVoid->charFirst);
printLine((char *)structCharVoid->voidSecond);
free(structCharVoid);
}
}
void CWE122_Heap_Based_Buffer_Overflow__char_type_overrun_memcpy_01_good()
{
good1();
}
int main(int argc, char * argv[])
{
srand( (unsigned)time(NULL) );
printLine("Calling good()...");
CWE122_Heap_Based_Buffer_Overflow__char_type_overrun_memcpy_01_good();
printLine("Finished good()");
return 0;
}
"""
    print(test3)
    # Braces and semicolons end the current output line; every other token
    # is followed by a single space.
    line_enders = ('{', '}', ";")
    for token in tokenizer(test3):
        terminator = '\n' if token in line_enders else ' '
        print(token, end=terminator)