Skip to content

Commit

Permalink
rename decomp correlate, add bsim option
Browse files Browse the repository at this point in the history
  • Loading branch information
clearbluejar committed Jan 4, 2024
1 parent 3e05870 commit 8677430
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from ghidra_builtins import *


def correlate_unmatched(self, matches, p1_missing, p2_missing, p1_matches, p2_matches):
def decomp_correlate(self, matches, p1_missing, p2_missing, p1_matches, p2_matches):
"""
from all of the unmatched functions remaining, see if any should be matched by decomp
This is slow, but sometimes necessary
Expand All @@ -14,7 +14,7 @@ def correlate_unmatched(self, matches, p1_missing, p2_missing, p1_matches, p2_ma
# only attempt if there is something to match
if len(p1_missing) > 0 and len(p2_missing) > 0:

self.logger.info(f'Attempting to correlate unmatched functions p1:{len(p1_missing)} p2:{len(p1_missing)}')
self.logger.info(f'Attempting to Decomp Correlate unmatched functions p1:{len(p1_missing)} p2:{len(p1_missing)}')

for p1_func in p1_missing:

Expand Down
14 changes: 11 additions & 3 deletions ghidriff/ghidra_diff_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ def __init__(
max_section_funcs: int = 200,
min_func_len: int = 10,
use_calling_counts: bool = True,
bsim: bool = True) -> None:
bsim: bool = True,
bsim_full: bool = False) -> None:


# setup engine logging
Expand Down Expand Up @@ -157,6 +158,7 @@ def __init__(
# if looking up more than calling_count_funcs_limit symbols, skip function counts
self.use_calling_counts = use_calling_counts
self.bsim = bsim
self.bsim_full = bsim_full

self.logger.debug(f'{vars(self)}')

Expand Down Expand Up @@ -189,9 +191,13 @@ def add_ghidra_args_to_parser(parser: argparse.ArgumentParser) -> None:
group.add_argument('--min-func-len', help='Minimum function length to consider for diff',
type=int, default=10),
group.add_argument('--use-calling-counts', help='Add calling/called reference counts', default=False,
action=argparse.BooleanOptionalAction)
action=argparse.BooleanOptionalAction)

group = parser.add_argument_group('BSIM Options')
group.add_argument('--bsim', help='Toggle using BSIM correlation', default=True,
action=argparse.BooleanOptionalAction)
group.add_argument('--bsim-full', help='Slower but better matching. Use only when needed', default=False,
action=argparse.BooleanOptionalAction)

# TODO add following option
# group.add_argument('--exact-matches', help='Only consider exact matches', action='store_true')
Expand Down Expand Up @@ -457,7 +463,7 @@ def setup_project(
if not project.getRootFolder().getFile(program_name):
self.logger.info(f'Importing {program_path} as {program_name}')
program = project.importProgram(program_path)
project.saveAs(program, "/", program_name, True)
project.saveAs(program, "/", program_name, True)
else:
self.logger.info(f'Opening {program_path}')
program = self.project.openProgram("/", program_name, False)
Expand Down Expand Up @@ -825,6 +831,8 @@ def analyze_program(self, df_or_prog: Union["ghidra.framework.model.DomainFile",
else:
self.logger.info(f"Analysis already complete.. skipping {program}!")
finally:
# from java.io import File
# self.project.saveAsPackedFile(program,File(f'/tmp/{program.name}.gzf'), True)
self.project.close(program)

self.logger.info(f"Analysis for {df_or_prog} complete")
Expand Down
23 changes: 16 additions & 7 deletions ghidriff/version_tracking_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from .ghidra_diff_engine import GhidraDiffEngine
from .implied_matches import correlate_implied_matches
from .correlate_unmatched import correlate_unmatched
from .decomp_correlate import decomp_correlate
from .bsim import correlate_bsim

if TYPE_CHECKING:
Expand Down Expand Up @@ -61,12 +61,12 @@ def find_matches(
# tuples of correlator instances
# ( name, hasher, one_to_one, one_to_many)
# DO NOT CHANGE ORDER UNLESS INTENDED, ORDER HAS MAJOR IMPACT ON ACCURACY AND EFFICIENCY
func_correlators = [
func_correlators = [
('ExactBytesFunctionHasher', ExactBytesFunctionHasher.INSTANCE, True, False),
('ExactInstructionsFunctionHasher', ExactInstructionsFunctionHasher.INSTANCE, True, False),
(StructuralGraphExactHasher.MATCH_TYPE, StructuralGraphExactHasher(), True, False),
('ExactMnemonicsFunctionHasher', ExactMnemonicsFunctionHasher.INSTANCE, True, False),
('BSIM', None, True, True), # not a true function hasher
('BSIM', None, True, True), # not a true function hasher
(BulkInstructionsHasher.MATCH_TYPE, BulkInstructionsHasher(), True, False),
(SigCallingCalledHasher.MATCH_TYPE, SigCallingCalledHasher(), True, False),
(StringsRefsHasher.MATCH_TYPE, StringsRefsHasher(), True, False),
Expand Down Expand Up @@ -119,6 +119,8 @@ def find_matches(


# Run Function Hash Correlators

func_matches = None
# Each round of matching will "accept" the matches and subtract them from the unmatched functions
# This is why the order of correlators matter
for cor in func_correlators:
Expand All @@ -131,8 +133,13 @@ def find_matches(
self.logger.debug(f'hasher: {hasher}')
self.logger.info(f'name: {name} one_to_one: {one_to_one} one_to_many: {one_to_many}')

if name == 'BSIM':
correlate_bsim(matches, p1,p2, p1_matches, p2_matches, monitor, self.logger, p1_addr_set=p1_unmatched, p2_addr_set=p2_unmatched, enabled=self.bsim)
if name == 'BSIM':
if self.bsim_full:
# slower, but uses full address space for matching
correlate_bsim(matches, p1,p2, p1_matches, p2_matches, monitor, self.logger,enabled=self.bsim)
else:
# only matches on functions that have no match
correlate_bsim(matches, p1,p2, p1_matches, p2_matches, monitor, self.logger, p1_addr_set=p1_unmatched, p2_addr_set=p2_unmatched, enabled=self.bsim)
else:
func_matches = MatchFunctions.matchFunctions(
p1, p1_unmatched, p2, p2_unmatched, self.MIN_FUNC_LEN, one_to_one, one_to_many, hasher, monitor)
Expand All @@ -142,6 +149,8 @@ def find_matches(
p2_matches.add(match.bFunctionAddress)
matches.setdefault((match.aFunctionAddress, match.bFunctionAddress), {}).setdefault(name, 0)
matches[(match.aFunctionAddress, match.bFunctionAddress)][name] += 1

self.logger.info(f'Match count: {func_matches.size()}')

end = time()

Expand All @@ -150,7 +159,7 @@ def find_matches(
p2_unmatched = p2_unmatched.subtract(p2_matches)

self.logger.info(f'{name} Exec time: {end-start:.4f} secs')
self.logger.info(f'Match count: {func_matches.size()}')


# kill noisy monitor after first run
monitor = ConsoleTaskMonitor().DUMMY_MONITOR
Expand Down Expand Up @@ -184,7 +193,7 @@ def find_matches(
p2_missing = self.get_funcs_from_addr_set(p2, p2_unmatched)

# attempt to correlate amongst unmatched functions
correlate_unmatched(self, matches, p1_missing, p2_missing, p1_matches, p2_matches)
decomp_correlate(self, matches, p1_missing, p2_missing, p1_matches, p2_matches)

p1_unmatched = p1_unmatched.subtract(p1_matches)
p2_unmatched = p2_unmatched.subtract(p2_matches)
Expand Down

0 comments on commit 8677430

Please sign in to comment.