This week i’m trying to upsolve the only unsolved rev challenge from wargames.my. The challenge is straightforward but requires more time to understand what the program does. I’ll explain it as briefly as possible to avoid wasting your time.
Finding Main Function
The binary was stripped, to find the main function we could check entrypoint (usually start function) and ida will directly show us the start function.
__int64 start()
{
unk_14000D1D0 = 0;
return sub_140001180();
}
From the sub_140001180
just find a call to function which has lot of mov operation before call.
.text:00000001400012C8 mov rax, cs:off_140009100
.text:00000001400012CF mov r8, cs:qword_14000D0F8
.text:00000001400012D6 mov ecx, cs:dword_14000D108
.text:00000001400012DC mov rax, [rax]
.text:00000001400012DF mov [rax], r8
.text:00000001400012E2 mov rdx, cs:qword_14000D100
.text:00000001400012E9 call sub_140001C80
So from above instruction we could tell that sub_140001C80
could be the main function. or from the decompiler we could see env initialization.
_initenv = qword_14000D0F8;
result = sub_140001C80((unsigned int)dword_14000D108, qword_14000D100);
What’s in main?
We could confirm the main function by looking at the use of argc, and argv which has int and char** data types. We could also see the flag input coming from argv.
__int64 __fastcall sub_140001C80(int a1, const char **a2)
{
const char *v4; // rbx
unsigned int v5; // eax
const char *v6; // rcx
_BYTE v8[16]; // [rsp+28h] [rbp-58h] BYREF
int v9; // [rsp+38h] [rbp-48h]
int v10; // [rsp+3Ch] [rbp-44h]
int v11; // [rsp+A8h] [rbp+28h]
int v12; // [rsp+C4h] [rbp+44h]
_BYTE v13[56]; // [rsp+C8h] [rbp+48h] BYREF
sub_140002110();
if ( a1 == 2 )
{
v4 = a2[1];
sub_140001540(v8);
sub_140001DC0(v13);
sub_140001E10(v13, 0LL, &unk_140009500, unk_140009808);
v11 = unk_140009810;
v5 = strlen(v4);
sub_140001E10(v13, (unsigned int)(unk_140009808 + 1000), v4, v5);
v9 = unk_140009808 + 1000;
v10 = strlen(v4);
while ( !(unsigned __int8)sub_140001590(v8, v13) )
;
v6 = "Incorrect!";
if ( v12 == 1 )
v6 = "Correct!";
puts(v6);
return 0LL;
}
else
{
sub_140001D70("Usage: %s <flag>\n", *a2);
return 1LL;
}
}
There’s lot of function but most of them are just initialization for VM. The only interesting function is that one used in while loop.
VM implementation
Inside that function we coulde see
__int64 __fastcall sub_140001590(__int64 a1, __int64 a2)
{
unsigned int v4; // eax
int v5; // ecx
int v6; // eax
_BYTE v8[36]; // [rsp+24h] [rbp-24h] BYREF
v4 = sub_140001E80(a2, *(unsigned int *)(a1 + 128));
sub_140001BB0(v8, v4, *(unsigned int *)(a1 + 128));
v5 = *(_DWORD *)(a1 + 128);
v6 = *(_DWORD *)(a1 + 140);
*(_DWORD *)(a1 + 148) = v5;
*(_DWORD *)(a1 + 128) = v5 + 4;
if ( v6 != -1 )
{
*(_DWORD *)(a1 + 128) = v6;
*(_DWORD *)(a1 + 140) = -1;
}
sub_140001610(a1, v8, a2);
return *(unsigned __int8 *)(a1 + 152);
}
This doesn’t really look like a vm, but wait until u see sub_140001610
, it’s full of opcode implementation. To keep track of the execution they use (a1 + 128)
, we could say that a1 is some sort of memory that acting like register. Then sub_140001E80
and sub_140001BB0
is the function that preparing data to execute.
The VM is simple: first, it prepares the data to execute, then tracks the execution, executes the instruction, and loops until (a1 + 152) returns a value. The execution stops only if the executed opcode does something with that register, and it seems like only opcode 0x3c affects it.
case 0x3C:
LODWORD(v6) = *(_DWORD *)(a1 + 16);
if ( (_DWORD)v6 == 1 )
{
*(_BYTE *)(a1 + 152) = 1;
LODWORD(v6) = *(_DWORD *)(a1 + 20);
*(_DWORD *)(a1 + 156) = v6;
}
U could read all the instruction in the function since there’s so many.
Reimplement VM
We can debug the VM execution with a debugger. Since there are no anti-debugging mechanisms, we can easily use IDA or an x64 debugger to trace the execution. However, I feel that this approach is not reliable, as I often lose track of what is happening.
So I decided to reimplement it in python but i don’t code all the opcode instruction since it will be waste of time, what i do is just try to implement the opcode one by one then run it to see which of the opcode used in the program but i haven’t implement it yet and i end up with this script.
import lief
from libnum import n2s,s2n
from ctypes import c_int16, c_uint32, c_uint64
binary = lief.parse('check.exe')
data = binary.get_content_from_virtual_address(0x0007FF66A9F9500-0x7FF66A9F0000,0x304)
X = binary.get_content_from_virtual_address(0x0007FF66A9F92C0-0x7FF66A9F0000,256)
X = [X[i] for i in range(0, len(X),4)]
Y = binary.get_content_from_virtual_address(0x0007FF66A9F93C0-0x7FF66A9F0000,256)
Y = [Y[i] for i in range(0, len(Y), 4)]
tmp = bytearray(0x100000)
tmp[:len(data)] = data
data = tmp
tmp_dat = bytearray(0x100000)
def sign_extend_32_to_64(value):
value &= 0xFFFFFFFF
if value & (1 << 31):
return value - (1 << 32)
return value
def ror_32(value, count):
# value &= 0xFFFFFFFF
return ((value >> count) | ((value & 0xFFFFFFFF) << (64 - count)))
reg = [0]*40
reg[4] = 0x6ec
reg[5] = 0x26
reg[32] = 0x234
reg[140 // 4] = -1
inp = b"FLAG INPUT HERE"
data[0x6ec:len(inp)+0x6ec] = inp
bc = {
0 : 0,
4 : 0,
8 : 0,
12 : 0,
13 : 0,
14 : 0,
16 : 0,
18 : 0,
}
while reg[38] == 0:
a3 = reg[32]
tmp = data[a3:a3+4]
a2 = s2n(tmp[::-1])
if a2 != 0:
v4 = ((a2 & 0xffff0000) >> 16) & 0x1f
v5 = X[a2 >> 26]
if v5 == 0x37:
if v4 == 0x10:
v5 = 0x3a
elif v4 == 0x11:
v5 = 0x3b
else:
v5 = (((a2 & 0xffff0000) >> 16) & 1) | 0x38
elif v5 == 0:
v5 = Y[a2 & 0x3f]
bc[0] = a3
bc[4] = v5
bc[8] = a2
bc[12] = (a2 >> 11) & 0x1F
bc[13] = v4
bc[14] = (a2 >> 21) & 0x1F
bc[16] = a2
bc[18] = (a2 >> 6) & 0x1F
else:
bc[0] = a3
bc[4] = 2
reg[32] += 4
reg[148 // 4] = a3
jmp = reg[140 // 4]
if jmp != -1:
reg[32] = jmp
reg[140 // 4] = -1
opcode = bc[4]
v5 = reg[(bc[14] & 0xff)]
v6 = v5
v7 = bc[13]
v8 = reg[v7]
v9 = v8
match opcode:
case 0x17:
v6 = bc[16] & 0xffff
reg[v7] = (v6 + v5)
print(f" ({hex(opcode)}) reg[{v7}] = {hex(reg[v7])} | {hex(v6)} + {hex(v5)}")
case 0x34:
print(f" ({hex(opcode)}) {hex(v5)} == {hex(v8)} | {v5 == v8}")
if v5 == v8:
print("\tcontinue")
else:
reg[140 // 4] = reg[128 // 4] + 4 * (bc[16] & 0xffff) & 0xff
print(f"jump reg[{140 // 4}] = {hex(reg[140//4])}")
case 4:
reg[v7] = data[((bc[16] & 0xffff) + v5) & 0xffff] & 0xff
print(f' ({hex(opcode)}) reg[{v7}] = {hex(reg[v7])} | {hex((bc[16] & 0xffff) + v5 - 0x6ec)} | {chr(reg[v7])}')
case 0x2:
print(f" ({hex(opcode)}) nop")
case 0xc:
v5 = c_uint32(c_int16(v5).value).value
idx = v5 + (bc[16] & 0xffff) & 0xffff
if not idx < 0xFFEFFFFF:
tmp_dat[idx - 0xFFEFFFFF] = v8
print(f" ({hex(opcode)}) tmp_dat[{idx}] = {hex(v8)}")
else:
data[idx] = v8 & 0xff
print(f" ({hex(opcode)}) data[{idx}] = {hex(v8)}")
case 0xa:
v5 = c_uint32(c_int16(v5).value).value
idx = v5 + (bc[16] & 0xffff) & 0xffff
if not idx < 0xFFEFFFFF:
tmp_dat[idx - 0xFFEFFFFF] = v8
print(f" ({hex(opcode)}) tmp_dat[{hex(idx)}] = {hex(v8)}")
else:
data[idx] = v8 & 0xff
print(f" ({hex(opcode)}) data[{hex(idx)}] = {hex(v8)}")
case 0x30:
v10 = reg[32]
reg[140 // 4] = (v10 & 0xF0000000) + 4 * (bc[8] & 0x3FFFFFF)
print(f" (0x30) reg[{140 // 4}] = {reg[140 // 4]}")
v6 = v10 + 4
reg[124 // 4] = v6
print(f" (0x30) reg[{124 // 4}] = {hex(v6)}")
case 0x1a:
v6 = bc[12]
reg[v6] = v5 | v8
print(f" (0x1a) reg[{v6}] = {hex(reg[v6])} | ( {hex(v5)} | {hex(v8)})")
case 0x1b:
v6 = bc[12]
reg[v6] = v5 ^ v8
print(f" (0x1b) reg[{v6}] = {hex(reg[v6])} | ( {hex(v5)} ^ {hex(v8)})")
case 0x1e:
v6 = bc[16] & 0xffff
reg[v7] = v6 | v5
print(f" (0x1a) reg[{v7}] = {hex(reg[v7])} | ( {hex(v6)} | {hex(v5)})")
case 0x10:
v6 = bc[12]
reg[v6] = (v5 & 0xffffffff) < (v8 & 0xffffffff)
print(f" (0x10) reg[{v6}] = {reg[v6]} | {hex(v5 & 0xffffffff)} < {hex(v8)}")
case 0x12:
v6 = bc[16] & 0xffff
v5 &= 0xff
reg[v7] = v5 < v6
print(f" (0x12) reg[{v7}] = {reg[v7]} | {hex(v5)} < {hex(v6)}")
case 0x13:
v6 = bc[12]
reg[v6] = v9 + v5
print(f" (0x13) reg[{v6}] = {hex(reg[v6])} | {hex(v9)} + {hex(v5)}")
case 0x15:
v6 = bc[12]
reg[v6] = c_uint32(v5 - v8).value
print(f" (0x15) reg[{v6}] = {hex(reg[v6])} | {hex(v5)} - {hex(v8)}")
case 0x31:
reg[140 // 4] = v5
print(f" (0x31) jump reg[{140 // 4}] = {hex(reg[140//4])}")
case 0x33:
# print([hex(bc[i]) for i in bc])
print(f" ({hex(opcode)}) {hex(v5)} != {hex(v8)} | {v5 != v8}")
if v5 != v8:
print("\tcontinue")
else:
reg[140 // 4] = (reg[128 // 4] + 4 * (bc[16] & 0xffff)) & 0xff
print(f"jump reg[{140 // 4}] = {hex(reg[140//4])}")
case 7:
idx = (bc[16] & 0xffff) + v5
reg[v7] = s2n(data[idx:idx+4][::-1])
print(f' ({hex(opcode)}) reg[{v7}] = {hex(reg[v7])} | {hex((bc[16] & 0xffff) + v5)}')
case 0x23:
v6 = bc[18]
reg[bc[12]] = (v8 << v6) & 0xffffffff
print(f' (0x23) reg[{bc[12]}] = {hex(reg[bc[12]])} | {hex(v8)} << {hex(v6)}')
case 0x26:
reg[v7] = (bc[16] & 0xffff) << 16
print(f' (0x26) reg[{v7}] = {hex(reg[v7])} | {hex(bc[16] & 0xffff)} << 16')
case 0x27:
v8 = sign_extend_32_to_64(v8)
v8 = c_uint64(v8).value
res = ror_32((v5 * v8) & 0xffffffffffffffff, 32)
reg[132 // 4] = res & 0xffffffff
reg[136 // 4] = (res >> 32) & 0xffffffff
print(f' (0x27) reg[{132 // 4}] = {hex(reg[132 // 4])} | ror({hex(v5)} * {hex(v8)}, 32)')
print(f' (0x27) reg[{136 // 4}] = {hex(reg[136 // 4])} | ror({hex(v5)} * {hex(v8)}, 32)')
case 0x2d:
reg[bc[12]] = reg[136 // 4]
print(f" (0x2d) reg[{bc[12]}] = {hex(reg[bc[12]])} | reg[{136 // 4}] | {reg[136 // 4]}")
case 0x2a:
if v8:
reg[136 // 4] = v5 // v9
reg[132 // 4] = v5 % v8
print(f" (0x2a) reg[{136//4}] = {hex(reg[136//4])} | {hex(v5)} // {hex(v8)}")
print(f" (0x2a) reg[{132//4}] = {hex(reg[132//4])} | {hex(v5)} % {hex(v8)}")
case 0x3c:
v6 = reg[16 // 4]
if v6 == 1:
reg[152 // 4] = 1
v6 = reg[20 // 4]
reg[156 // 4] = v6
print(f" (0x3c) reg[{152 // 4}] = 1; reg[{156 // 4}] = {hex(v6)}")
else:
print(v6)
break
case _:
print([hex(bc[i]) for i in bc])
break
print(f"CHECKERRRR {hex(reg[7])}")
print([hex(i) for i in reg])
It’s a bit messy I don’t have time to clean it or make a better one.
Understanding Flag Checker
Once we got our vm then we could analyze the output.
First it checks the length of our input, just play with the input and see which value changed (that’s what i do).
(0x17) reg[3] = 0x26 | 0x26 + 0x0
(0x34) 0x26 == 0x26 | True
And it check flag length with 0x26, then it perform check with some chars at some index.
(0x17) reg[2] = 0x3 | 0x3 + 0x0
(0x4) reg[5] = 0x77 | 0x0 | w
(0x17) reg[3] = 0x77 | 0x77 + 0x0
(0x34) 0x77 == 0x77 | True
continue
(0x17) reg[2] = 0x4 | 0x4 + 0x0
(0x4) reg[5] = 0x67 | 0x1 | g
(0x17) reg[3] = 0x67 | 0x67 + 0x0
(0x34) 0x67 == 0x67 | True
continue
(0x17) reg[6] = 0x1 | 0x1 + 0x0
(0x4) reg[5] = 0x6d | 0x2 | m
(0x17) reg[3] = 0x6d | 0x6d + 0x0
(0x34) 0x6d == 0x6d | True
continue
(0x2) nop
(0x4) reg[5] = 0x79 | 0x3 | y
(0x17) reg[3] = 0x79 | 0x79 + 0x0
(0x34) 0x79 == 0x79 | True
continue
(0x2) nop
(0x4) reg[5] = 0x7b | 0x4 | {
(0x17) reg[3] = 0x7b | 0x7b + 0x0
(0x34) 0x7b == 0x7b | True
continue
(0x2) nop
(0x4) reg[3] = 0x7d | 0x25 | }
(0x17) reg[2] = 0x7d | 0x7d + 0x0
(0x34) 0x7d == 0x7d | True
continue
Note that this output is the output with the correct flag input so all the check will be true, when u try the vm urself u might get different value, but the thing is we has to make it true so we could move further because once we got the wrong jump it means the program ends.
The actual flag check (value inside the flag format) was perform in 7 parts.
- first 4 chars
- second 4 chars
- third 4 chars
- fourth 4 chars
- first 16 chars
- first 8 chars of the second 16 chars
- second 8 chars of the second 16 chars
There’s also a check for each char has to be [0-9a-f] but i don’t really understand how it perform the check i guess it was this.
(0x17) reg[3] = 0x6f1 | 0x5 + 0x6ec
(0x17) reg[7] = 0x70c | 0x20 + 0x6ec
(0x1a) reg[5] = 0x6f1 | ( 0x6f1 | 0x0)
(0x4) reg[2] = 0x33 | 0x5 | 3
(0x17) reg[5] = 0x6f2 | 0x1 + 0x6f1
(0x17) reg[6] = 0x10003 | 0xffd0 + 0x33
(0x12) reg[6] = True | 0x3 < 0xa
(0x17) reg[2] = 0xffd2 | 0xff9f + 0x33
(0x34) 0x1 == 0x0 | False
All the operation was performed first then it will check those parts.
Back to the actual flag check, first 4 chars was multiply it check our flag * something equal to something, and remeber this is 32 bit program so it just perform the last 32 bits result, this check easily bruteforced
bf = '0123456789abcdef'
end = False
for i in bf:
for j in bf:
for k in bf:
for l in bf:
tmp = s2n(i + j + k +l)
if tmp * 0xe60e31d7 & 0xffffffff == 0x2f533dd5:
print((i + j + k + l)[::-1])
Same as the third 4 chars it just multiply flag with something, could use the same script but changed the value, this bruteforce script will return only 1 possible combination for both first and third 4 chars.
For the second 4 chars we could also do bruteforce but there’s lot of possibility because it’s doing division and addition.
bf = "0123456789abcdef"
end = False
for i in bf:
for j in bf:
for k in bf:
for l in bf:
tmp = s2n(i + j + k +l)
tmp = tmp // 0x1a6b0
tmp += 0x76741f54
if tmp * 0x26fec866 & 0xffffffff == 0xfe37c998 :
print((i + j + k + l)[::-1])
This result so many but the last char is always same —> [0-9a-f][0-9a-f][45]d
(not good at regex).
The fourth part of the flag also return in many possible combination.
bf = '0123456789abcdef'
end = False
for i in bf:
for j in bf:
for k in bf:
for l in bf:
tmp = s2n(i + j + k +l)
if tmp + 0x9d9d0b86 & 0xffffffff < 0x881b :
print((i + j + k + l)[::-1])
The regex is [0-9a-f][0-9a-f]cb
.
The fifth part is the first 16 chars so it’s first, second, third, and fourth part combined, if u see the flatten flow (my vm output), this for example.
(0x15) reg[2] = 0xc32be44f | 0x12881c24 - 0x4f5c37d5
(0x4) reg[5] = 0x31 | 0xe | 1
(0x17) reg[3] = 0x6fb | 0x1 + 0x6fa
(0x1b) reg[5] = 0xc32be47e | ( 0x31 ^ 0xc32be44f)
(0x23) reg[2] = 0xf23f0000 | 0xc32be47e << 0xf
(0x13) reg[2] = 0x1b56ae47e | 0xc32be47e + 0xf23f0000
(0x23) reg[2] = 0xd5ab91f8 | 0x1b56ae47e << 0x2
(0x15) reg[2] = 0x127fad7a | 0xd5ab91f8 - 0xc32be47e
(0x23) reg[2] = 0x93fd6bd0 | 0x127fad7a << 0x3
(0x13) reg[2] = 0x15729504e | 0xc32be47e + 0x93fd6bd0
(0x23) reg[2] = 0x5ca54138 | 0x15729504e << 0x2
(0x13) reg[2] = 0x11fd125b6 | 0xc32be47e + 0x5ca54138
(0x23) reg[2] = 0x7f4496d8 | 0x11fd125b6 << 0x2
It hard to determine what is the algorithm, but if u find the first number used and search it on google 0x811c9dc5
, this is FNV1a algorithm (hashing algorithm), we could check it with compiling this simple c program then compared with our result.
#include <stdio.h>
#include <stdint.h>
uint32_t hash_32_fnv1a(const void* key, const uint32_t len) {
const char* data = (char*)key;
uint32_t hash = 0x811c9dc5;
uint32_t prime = 0x1000193;
for(int i = 0; i < len; ++i) {
uint8_t value = data[i];
hash = hash ^ value;
hash *= prime;
}
return hash;
}
int main() {
const char* data = "0101070009070103"; // input
uint32_t hash = hash_32_fnv1a(data, 16);
printf("Hash: %x\n", hash);
return 0;
}
FNV1a is not a cryptography secure hash algorithm there’s another ctf challenge that uses that fact and could reverse the output back to input, but it was for the 64bit version and i’m not crypto person when i try use the poc with this challenge which is using 32 bit version i don’t get the correct input back and idk why (already changed some stuff and still doesn’t work), so we’ll back to bruteforcing since we know most of the char it will be fast.
fofiv = ['4','5']
bf = 'abcdef0123456789'
end = False
for i in bf:
for j in bf:
for k in bf:
for l in bf:
for x in fofiv:
tmp = f'3011{i}{j}{x}de101{k}{l}cb'.encode()
hash = 0x811c9dc5
for fck in tmp:
hash = fck ^ hash
hash = c_uint32(hash * 0x1000193).value
# print(tmp, hex(hash))
if hash == 0x8a3b532e:
print(tmp)
end = True
break
# init = 0x811c9dc5
# last = 0
# for fck in tmp:
# f0 = c_uint32(init - last).value
# f1 = fck ^ f0
# f2 = c_uint32(f1 << 0xf).value
# f3 = c_uint32(f2 + f1).value
# f4 = c_uint32(f3 << 0x2).value
# f5 = c_uint32(f4 - f1).value
# f6 = c_uint32(f5 << 0x3).value
# f7 = c_uint32(f6 + f1).value
# f8 = c_uint32(f7 << 0x2).value
# f9 = c_uint32(f8 + f1).value
# fa = c_uint32(f9 << 0x2).value
# last = f1
# init = fa
# if c_uint32(init - last).value == 0x8a3b532e:
# print(tmp)
# end = True
# break
if end:
break
if end:
break
if end:
break
if end:
break
After that we successfully recovered the first 16 chars, what about the second half?, turns out it was easier because it just z3 things they basically do this and .
So i just use this z3 script to retrieve the correct value back.
from z3 import *
from libnum import s2n, n2s
b1 = BitVecVal(s2n("1103"), 32)
b2 = BitVecVal(s2n("d54a"), 32)
t1 = BitVecVal(0xa45aa159,32)
t2 = BitVecVal(0xffae00a9,32)
c1 = BitVecVal(s2n("101e"), 32)
c2 = BitVecVal(s2n("bc42"), 32)
tt1 = BitVecVal(0x57a20d60,32)
tt2 = BitVecVal(0x52fffcb6, 32)
solver = Solver()
X = BitVec('X', 32)
Y = BitVec('Y', 32)
A = BitVec('A', 32)
B = BitVec('B', 32)
P = b1 ^ X
Q = b2 ^ Y
solver.add(P + Q == t1)
solver.add(P - Q == t2)
K = c1 ^ A
L = c2 ^ B
solver.add(K + L == tt1)
solver.add(L - K == tt2)
for i in range(4):
Xi = (X >> (8 * i)) & 0xFF
Yi = (Y >> (8 * i)) & 0xFF
Ai = (A >> (8 * i)) & 0xFF
Bi = (B >> (8 * i)) & 0xFF
solver.add(Or(And(Xi >= ord('0'), Xi <= ord('9')),
And(Xi >= ord('a'), Xi <= ord('f'))))
solver.add(Or(And(Yi >= ord('0'), Yi <= ord('9')),
And(Yi >= ord('a'), Yi <= ord('f'))))
solver.add(Or(And(Ai >= ord('0'), Ai <= ord('9')),
And(Ai >= ord('a'), Ai <= ord('f'))))
solver.add(Or(And(Bi >= ord('0'), Bi <= ord('9')),
And(Bi >= ord('a'), Bi <= ord('f'))))
if solver.check() == sat:
model = solver.model()
X_value = n2s(model[X].as_long())[::-1]
Y_value = n2s(model[Y].as_long())[::-1]
print(f"X = {X_value}, Y = {Y_value}")
A_value = n2s(model[A].as_long())[::-1]
B_value = n2s(model[B].as_long())[::-1]
print(f"A = {A_value}, B = {B_value}")
else:
print("No solution found.")
The final flag is : wgmy{3011a45de10124cb2a5c9dc609a39127}