@UserUnknownFactor
Last active August 25, 2025 15:43
Python tool to translate strings in x86 executables by redirecting their references to a newly created .trans PE section
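
The patcher consumes a CSV of rows of the form original, translation, file offset (decimal, or hex with a 0x prefix); the offsets come from the extraction script at the bottom of the gist. A purely hypothetical example of the expected shape:

はい,Yes,0x4A3B0
いいえ,No,305472
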
import struct
import pefile
import re
import capstone
from tqdm import tqdm

CACHE_DIR = "__pycache__"
cache = None
ENABLE_CACHE = True
try:
    from diskcache import Cache
    cache = Cache(CACHE_DIR)
except ImportError:
    ENABLE_CACHE = False

from filetranslate.service_fn import read_csv_list, write_csv_list

def patch_exe_with_translations(exe_path, csv_path, output_path, allowed_sections=[b'.text']):
    # Read translations from CSV
    translations = read_csv_list(csv_path)
    for row in translations:
        if len(row) >= 3:
            offset_str = row[2]
            if offset_str:
                row[2] = int(offset_str, 16) if offset_str.startswith('0x') else int(offset_str)
    if not translations:
        print(f"Empty {csv_path}")
        return False
    print(f"Using {csv_path} with {len(translations)} lines to translate {exe_path}...")

    # Parse the PE file
    pe = pefile.PE(exe_path)

    # Initialize disassembler for 32-bit x86
    md = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_32)
    md.detail = True
    md.skipdata = True

    # Calculate where to add the new section (align up past the last section)
    last_section = pe.sections[-1]
    new_section_offset = (last_section.VirtualAddress +
        last_section.Misc_VirtualSize +
        pe.OPTIONAL_HEADER.SectionAlignment - 1) & ~(pe.OPTIONAL_HEADER.SectionAlignment - 1)
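    # Worked example of the align-up idiom (hypothetical numbers): with
    # SectionAlignment = 0x1000 and an unaligned end of 0x12345,
    # (0x12345 + 0xFFF) & ~0xFFF == 0x13000, the next alignment boundary.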

    # Create the new section
    new_section_data = bytearray()
    va_of_new_strings = {}  # Maps original string RVA to new string RVA
    patch_locations = []  # List of (file_offset, new_va) tuples
    patch_locations_byte = []  # List of (file_offset, new_byte) tuples
    reference_map = {}

    print("Preprocessing sections...")
    for sn, section in enumerate(pe.sections):
        if not any(section.Name.startswith(allowed) for allowed in allowed_sections):
            continue
        section_data = section.get_data()
        section_offset = section.PointerToRawData

        section_map = None
        if ENABLE_CACHE and cache:
            section_map = cache.get(f"cached_disasm_of_{section.Name}")
        if section_map is None:
            section_map = {}
            has_prior_push = False
            for insn in md.disasm(section_data, 0):
                if insn.mnemonic == '.byte':  # skipdata placeholder, not a real instruction
                    continue
                for op in insn.operands:
                    if op.type == capstone.CS_OP_IMM:
                        imm_pos = insn.imm_offset
                        if imm_pos is not None:
                            # File offset of the imm32 field inside this instruction
                            imm_file_offset = section_offset + insn.address + imm_pos
                            section_map.setdefault(op.imm, []).append(
                                (insn.address, imm_file_offset, sn,
                                 has_prior_push and insn.mnemonic == 'push'))
                has_prior_push = insn.mnemonic == 'push' and insn.imm_size == 1
            if ENABLE_CACHE and cache:
                cache.set(f"cached_disasm_of_{section.Name}", section_map)
        # Merge the per-section map into the overall reference map
        for imm, refs in section_map.items():
            reference_map.setdefault(imm, []).extend(refs)
    print(f"Collected {sum(len(v) for v in reference_map.values())} operand references.")

    # Add all translations to the new section
    current_offset = 0
    i = 0
    image_base = pe.OPTIONAL_HEADER.ImageBase
    for row in tqdm(translations, desc="Processing translations"):
        i += 1
        if len(row) < 3:
            print(f"Error in row {row}")
            continue
        original = row[0]
        translation = row[1]
        if original.startswith('//') and not translation.startswith('//'):
            continue
        string_offset = row[2]
        if not string_offset:
            print(f"Error in row {row}")
            continue

        # Convert file offset to RVA by finding the section containing this offset
        string_rva = None
        for section in pe.sections:
            if section.PointerToRawData <= string_offset < (section.PointerToRawData + section.SizeOfRawData):
                string_rva = string_offset - section.PointerToRawData + section.VirtualAddress
                break
        if string_rva is None:
            print(f"Warning: Could not map file offset {string_offset} to RVA")
            continue

        # Store mapping from original string RVA to new string RVA
        va_of_new_strings[string_rva] = new_section_offset + current_offset

        # Add the translated string to our new section
        encoded_translation = translation.encode('utf-16le') + b'\0\0'
        new_section_data.extend(encoded_translation)
        current_offset += len(encoded_translation)
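        # Encoding example (illustrative): "あ".encode('utf-16le') + b'\0\0'
        # yields b'\x42\x30\x00\x00' - two bytes per code unit plus the
        # double-null terminator that the extraction script also assumes.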

        # Apply patches for every collected reference to this string's VA
        for insn_offset_in_section, imm_file_offset, ref_sn, has_prior_push in reference_map.get(
                string_rva + image_base, []):
            sec = pe.sections[ref_sn]
            section_data = sec.get_data()
            new_va = image_base + va_of_new_strings[string_rva]
            patch_locations.append((imm_file_offset, new_va))
            # Fix string lengths pushed to the stack right before the string
            # address. NOTE: DANGER ZONE
            # push LEN; push STR => 6A XX 68 XX XX XX XX
            if has_prior_push and section_data[insn_offset_in_section - 2] == 0x6A:
                if section_data[insn_offset_in_section - 1] == len(original):
                    # Cap at 127 because a short push immediate sign-extends the value
                    tl_len = len(translation)
                    if tl_len > 127:
                        print(f"\nNeed to fix line {i} = {translation}")
                    # The length byte sits two bytes before the imm32 field
                    patch_locations_byte.append((imm_file_offset - 2, min(127, tl_len)))
            # Variant with one single-byte instruction between the pushes:
            # push LEN; <one-byte insn>; push STR => 6A XX ?? 68 XX XX XX XX
            if insn_offset_in_section < len(section_data) and section_data[insn_offset_in_section] == 0x68 and section_data[insn_offset_in_section - 3] == 0x6A:
                if section_data[insn_offset_in_section - 2] == len(original):
                    tl_len = len(translation)
                    if tl_len > 127:
                        print(f"\nNeed to fix line {i} = {translation}")
                    patch_locations_byte.append((imm_file_offset - 3, min(127, tl_len)))
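    # Byte-level example of the pattern above (hypothetical values): the
    # sequence "push 13; push 0x452A10" assembles to 6A 0D 68 10 2A 45 00;
    # the imm32 10 2A 45 00 is retargeted to the string's .trans copy and
    # the 0D length byte is rewritten to the translation's length.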

    # Align the section size to file alignment
    aligned_size = (len(new_section_data) + pe.OPTIONAL_HEADER.FileAlignment - 1) & ~(pe.OPTIONAL_HEADER.FileAlignment - 1)
    new_section_data.extend(b'\0' * (aligned_size - len(new_section_data)))

    # Calculate where the new section will be placed in the file
    new_section_raw_pointer = (
        last_section.PointerToRawData +
        last_section.SizeOfRawData +
        pe.OPTIONAL_HEADER.FileAlignment - 1) & ~(
        pe.OPTIONAL_HEADER.FileAlignment - 1)

    # Create a new section header
    new_section = pefile.SectionStructure(pe.__IMAGE_SECTION_HEADER_format__)
    new_section.set_file_offset(pe.sections[-1].get_file_offset() + pe.sections[-1].sizeof())

    # Set the section properties
    new_section_name = b'.trans'
    new_section.Name = new_section_name[:8] + b'\0' * max(8 - len(new_section_name), 0)
    new_section.Misc = len(new_section_data)  # virtual size
    new_section.VirtualAddress = new_section_offset
    new_section.SizeOfRawData = aligned_size
    new_section.PointerToRawData = new_section_raw_pointer
    new_section.PointerToRelocations = 0
    new_section.PointerToLinenumbers = 0
    new_section.NumberOfRelocations = 0
    new_section.NumberOfLinenumbers = 0
    new_section.Characteristics = 0x40000040  # IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ

    # Update the PE header
    pe.FILE_HEADER.NumberOfSections += 1
    pe.OPTIONAL_HEADER.SizeOfImage = (
        new_section_offset + aligned_size +
        pe.OPTIONAL_HEADER.SectionAlignment - 1) & ~(
        pe.OPTIONAL_HEADER.SectionAlignment - 1)

    # Make a copy of the original exe
    with open(exe_path, 'rb') as f:
        exe_data = bytearray(f.read())

    # Apply patches to the original data
    for file_offset, new_va in patch_locations:
        struct.pack_into('<I', exe_data, file_offset, new_va)
    for file_offset, new_byte in patch_locations_byte:
        exe_data[file_offset] = new_byte

    # Write the modified exe
    with open(output_path, 'wb') as f:
        # Calculate where the section table is
        section_table_offset = pe.DOS_HEADER.e_lfanew + 4 + pe.FILE_HEADER.sizeof() + pe.FILE_HEADER.SizeOfOptionalHeader
        # Update NumberOfSections in the file header
        struct.pack_into('<H', exe_data, pe.DOS_HEADER.e_lfanew + 6, pe.FILE_HEADER.NumberOfSections)
        # Update SizeOfImage in the optional header (offset 56 within a PE32 optional header)
        image_size_offset = pe.DOS_HEADER.e_lfanew + 4 + pe.FILE_HEADER.sizeof() + 56
        struct.pack_into('<I', exe_data, image_size_offset, pe.OPTIONAL_HEADER.SizeOfImage)
        # Write everything up to the section table
        f.write(exe_data[:section_table_offset])
        # Write all original section headers
        for section in pe.sections:
            f.write(section.__pack__())
        # Write our new section header
        new_section_header = new_section.__pack__()
        f.write(new_section_header)
        # Calculate where the original section table ends (NumberOfSections
        # was already incremented above, hence the -1)
        original_section_table_end = section_table_offset + (pe.FILE_HEADER.NumberOfSections - 1) * 40
        # Write the data between the section table and our new section
        if new_section_raw_pointer > original_section_table_end:
            f.write(exe_data[original_section_table_end + len(new_section_header):new_section_raw_pointer])
        else:
            # Seek to where our section should start
            f.seek(new_section_raw_pointer)
        # Write our new section data
        f.write(new_section_data)

    if pe.OPTIONAL_HEADER.CheckSum:
        # Recalculate the checksum with pefile
        try:
            patched_pe = pefile.PE(output_path)
            patched_pe.OPTIONAL_HEADER.CheckSum = patched_pe.generate_checksum()
            patched_pe.write(filename=output_path)
        except Exception as e:
            print(f"Warning: Could not recalculate checksum: {e}")

    print(f"Patched {len(patch_locations)} references to {len(va_of_new_strings)} strings")
    return True

def find_instruction_references(exe_path, target_addresses, output_csv=None):
    """
    Find all instructions in the code section that reference specific addresses.

    Args:
        exe_path: Path to the executable
        target_addresses: List of virtual addresses to find references to
        output_csv: Optional path to save results to CSV
    """
    pe = pefile.PE(exe_path)

    # Initialize disassembler
    md = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_32)
    md.detail = True
    results = []

    # Process each code section
    for section in [s for s in pe.sections if s.Name.startswith(b'.text')]:
        section_data = section.get_data()
        section_va = section.VirtualAddress + pe.OPTIONAL_HEADER.ImageBase

        # Disassemble the section
        for insn in md.disasm(section_data, section_va):
            # Check each immediate operand for references to target addresses
            for op in insn.operands:
                if op.type == capstone.CS_OP_IMM and op.imm in target_addresses:
                    # Calculate file offset of the instruction
                    file_offset = section.PointerToRawData + (insn.address - section_va)
                    results.append({
                        'address': insn.address,
                        'file_offset': file_offset,
                        'mnemonic': insn.mnemonic,
                        'op_str': insn.op_str,
                        'bytes': ' '.join(f'{b:02x}' for b in insn.bytes),
                        'target_address': op.imm
                    })

    # Optionally save to CSV
    if output_csv and results:
        headers = ['address', 'file_offset', 'mnemonic', 'op_str', 'bytes', 'target_address']
        csv_data = [headers] + [[str(row[h]) for h in headers] for row in results]
        write_csv_list(output_csv, csv_data)
    return results
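
# Minimal usage sketch (hypothetical address and file names): dump every
# instruction that references a given string VA to a CSV for inspection:
#   refs = find_instruction_references('Original.exe', [0x00452A10], 'refs.csv')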

def main():
    exe_path = 'Original.exe'
    csv_path = exe_path.replace('.exe', '_strings.csv')
    output_path = f'translation_out\\{exe_path}'
    if patch_exe_with_translations(exe_path, csv_path, output_path):
        print(f"Successfully patched {exe_path} with translations to {output_path}")


if __name__ == "__main__":
    main()
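
The gist also bundles the extraction script that builds the initial CSV: it scans data sections for UTF-16LE and ASCII strings, follows immediate operands in the code section to catch referenced strings, and merges any translations already present in the CSV back into the refreshed list.
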
import pefile
import re
import struct
from filetranslate.service_fn import read_csv_list, write_csv_list


def extract_strings_from_exe(exe_path):
    # Load the PE file
    pe = pefile.PE(exe_path)

    # Initialize results array: [string, '', file_offset]
    results = []
    found_strings = set()  # To avoid duplicates

    # Helper function to validate string quality
    def is_valid_string(s, min_length=1):
        if len(s) < min_length:
            return False
        # All characters should be printable or whitespace
        if not all(c.isprintable() or c.isspace() for c in s):
            return False
        # For longer strings, apply more heuristics
        if len(s) > 8:
            # Check for too many special characters
            special_char_count = sum(not c.isalnum() and not c.isspace() for c in s)
            if special_char_count / len(s) > 0.5:  # More than 50% special chars
                return False
        return True
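    # For example (illustrative): is_valid_string("Load game") passes, while
    # "=+=+=+=+=+" fails the >50% special-character check and binary noise
    # containing control bytes fails the printable check.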

    # PART 1: Find all strings in data sections
    for section in pe.sections:
        # Skip code sections - focus on data sections
        if section.Name.startswith(b'.text'):
            continue
        section_data = section.get_data()
        section_offset = section.PointerToRawData

        # Find UTF-16LE strings delimited by double-null bytes
        unicode_pattern = re.compile(rb'\x00\x00[\s\S]{4,512}\x00\x00', re.DOTALL)
        for match in unicode_pattern.finditer(section_data):
            start = match.start() + 2  # Skip the leading double null
            end = match.end() - 2  # Exclude the null terminator
            try:
                string_value = section_data[start:end].decode('utf-16le')
                if is_valid_string(string_value):
                    file_offset = section_offset + start
                    if string_value not in found_strings:
                        results.append([string_value, '', file_offset])
                        found_strings.add(string_value)
            except UnicodeDecodeError:
                pass

        # Find ASCII strings
        ascii_pattern = re.compile(rb'[^\x00-\x1F\x7F-\xFF]{4,}?\x00', re.DOTALL)
        for match in ascii_pattern.finditer(section_data):
            start = match.start()
            end = match.end() - 1  # Exclude the null terminator
            try:
                string_value = section_data[start:end].decode('ascii')
                if is_valid_string(string_value):
                    file_offset = section_offset + start
                    if string_value not in found_strings:
                        results.append([string_value, '', file_offset])
                        found_strings.add(string_value)
            except UnicodeDecodeError:
                pass

    # PART 2: Find strings by reference patterns in code sections
    # Define instruction patterns to search for
    instr_patterns = [
        # PUSH imm32: 68 xx xx xx xx
        {'pattern': re.compile(rb'\x68(....)', re.DOTALL), 'operand_offset': 1, 'size': 4},
        # MOV reg, imm32: B8-BF xx xx xx xx
        {'pattern': re.compile(rb'[\xB8-\xBF](....)', re.DOTALL), 'operand_offset': 1, 'size': 4},
        # LEA reg, [disp32]: 8D modrm xx xx xx xx
        {'pattern': re.compile(rb'\x8D[\x05\x0D\x15\x1D\x25\x2D\x35\x3D](....)', re.DOTALL), 'operand_offset': 2, 'size': 4},
    ]
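    # Decoding example (hypothetical bytes): for 68 10 2A 45 00, match.group(1)
    # is b'\x10\x2A\x45\x00' and struct.unpack('<I', ...) yields the
    # little-endian VA 0x00452A10, which is then probed for a string below.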

    def parse_potential_string_reference(match, pattern_info, section):
        string_va = struct.unpack('<I', match.group(1))[0]
        # Check whether this points to a valid string
        string_rva = string_va - pe.OPTIONAL_HEADER.ImageBase
        try:
            file_offset = pe.get_offset_from_rva(string_rva)
            string_data = pe.get_data(string_rva, 1024)
            if not string_data or len(string_data) < 2:
                return None
            # Find the end of a UTF-16LE string
            null_pos = string_data.find(b'\0\0')
            if null_pos >= 2:
                try:
                    string_value = string_data[:null_pos].decode('utf-16le')
                    if is_valid_string(string_value):
                        return string_value, file_offset
                except UnicodeDecodeError:
                    pass
                try:  # maybe a stray ASCII ending shifted the terminator by one byte?
                    string_value = string_data[:null_pos + 1].decode('utf-16le')
                    if is_valid_string(string_value):
                        return string_value, file_offset
                except UnicodeDecodeError:
                    pass
                try:  # or by two bytes?
                    string_value = string_data[:null_pos + 2].decode('utf-16le')
                    if is_valid_string(string_value):
                        return string_value, file_offset
                except UnicodeDecodeError:
                    pass
            # Try ASCII if not UTF-16LE
            null_pos = string_data.find(b'\x00')
            if null_pos > 0:
                string_value = string_data[:null_pos].decode('ascii', errors='ignore')
                if is_valid_string(string_value):
                    return string_value, file_offset
        except Exception:
            pass
        return None

    # Search for string references in code sections
    for section in [s for s in pe.sections if s.Name.startswith(b'.text')]:
        section_data = section.get_data()
        # Process each instruction pattern
        for pattern_info in instr_patterns:
            for match in pattern_info['pattern'].finditer(section_data):
                result = parse_potential_string_reference(match, pattern_info, section)
                if result:
                    string_value, file_offset = result
                    string_value = string_value.replace('\r', '')
                    if string_value not in found_strings:
                        results.append([string_value, '', file_offset])
                        found_strings.add(string_value)
    return results

def validate_string(string):
    valid_pattern = re.compile(r'[\u3041-\u3096\u30A0-\u30FF\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6A\u2E80-\u2FD5\uFF5F-\uFF9F\u3000-\u303F\u31F0-\u31FF\u3220-\u3243\u3280-\u337F\uFF01-\uFF5E\u2026-\u203Ba-zA-Z\d\s.,!?()\-\[\!@#\$%\^&\*:;\n\'\"()_\+=,\.\/?\\\|\[\]`~]+')
    return bool(valid_pattern.match(string))

# Usage example:
file_path = 'Original.exe'
csv_path = 'Original_strings.csv'
strings = extract_strings_from_exe(file_path)
tled = read_csv_list(csv_path)
# Carry over translations that already exist in the CSV
for i, item1 in enumerate(strings):
    for item in tled:
        if item1[0] == item[0]:
            strings[i][1] = item[1]
            break
write_csv_list(csv_path, strings)
diskcache
tqdm
capstone
filetranslate
@killerswin2

Holy shit, thank you for making this! This will literally save me so much time by not having to reverse engineer and use weird exe "hacks".
