1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
use super::*;
use crate::analysis;
use crate::utils::log::LogMessage;
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};

/// Contains implementation of the block duplication normalization pass.
mod block_duplication_normalization;
use block_duplication_normalization::*;
pub mod propagate_control_flow;
use propagate_control_flow::*;

/// The `Project` struct is the main data structure representing a binary.
///
/// It contains information about the disassembled binary
/// and about the execution environment of the binary.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone)]
pub struct Project {
    /// All (known) executable code of the binary is contained in the `program` term.
    pub program: Term<Program>,
    /// The CPU architecture on which the binary is assumed to be executed.
    pub cpu_architecture: String,
    /// The stack pointer register for the given CPU architecture.
    pub stack_pointer_register: Variable,
    /// The known calling conventions that may be used for calls to extern functions.
    pub calling_conventions: BTreeMap<String, CallingConvention>,
    /// The set of all known physical registers for the CPU architecture.
    /// Does only contain base registers, i.e. sub registers of other registers are not contained.
    pub register_set: BTreeSet<Variable>,
    /// Contains the properties of C data types. (e.g. size)
    pub datatype_properties: DatatypeProperties,
    /// Represents the memory after loading the binary.
    pub runtime_memory_image: RuntimeMemoryImage,
}

impl Project {
    /// Return the size (in bytes) for pointers of the given architecture.
    pub fn get_pointer_bytesize(&self) -> ByteSize {
        self.stack_pointer_register.size
    }

    /// Try to guess a standard calling convention from the list of calling conventions in the project.
    pub fn get_standard_calling_convention(&self) -> Option<&CallingConvention> {
        self.calling_conventions
            .get("__stdcall")
            .or_else(|| self.calling_conventions.get("__cdecl"))
            .or_else(|| self.calling_conventions.get("__thiscall")) // for x86_64 Microsoft Windows binaries.
    }

    /// Try to find a specific calling convention in the list of calling conventions in the project.
    /// If not given a calling convention (i.e. given `None`) or the given calling convention name was not found
    /// then falls back to `get_standard_calling_convention`.
    pub fn get_specific_calling_convention(
        &self,
        cconv_name_opt: &Option<String>,
    ) -> Option<&CallingConvention> {
        // FIXME: On x86 Windows binaries we can get a strange edge case:
        // For some reason we get cases where Ghidra annotates a function with `__cdecl` as calling convention,
        // but the general calling convention list only contains `__fastcall` and `__thiscall`.
        // We should investigate this, so that we do not have to fall back to the standard calling convention.
        cconv_name_opt
            .as_ref()
            .and_then(|cconv_name| self.calling_conventions.get(cconv_name))
            .or_else(|| self.get_standard_calling_convention())
    }

    /// Return the calling convention associated to the given extern symbol.
    /// If the extern symbol has no annotated calling convention
    /// then return the standard calling convention of the project instead.
    ///
    /// This function panics if no suitable calling convention is found.
    pub fn get_calling_convention(&self, extern_symbol: &ExternSymbol) -> &CallingConvention {
        if let Some(cconv_name) = &extern_symbol.calling_convention {
            self.calling_conventions.get(cconv_name).unwrap()
        } else {
            self.get_standard_calling_convention().unwrap()
        }
    }
}

impl Project {
    /// For all expressions contained in the project,
    /// replace trivially computable subexpressions like `a XOR a` with their result.
    pub fn substitute_trivial_expressions(&mut self) {
        for sub in self.program.term.subs.values_mut() {
            for block in sub.term.blocks.iter_mut() {
                for def in block.term.defs.iter_mut() {
                    match &mut def.term {
                        Def::Assign { value: expr, .. } | Def::Load { address: expr, .. } => {
                            expr.substitute_trivial_operations()
                        }
                        Def::Store { address, value } => {
                            address.substitute_trivial_operations();
                            value.substitute_trivial_operations();
                        }
                    }
                }
                for jmp in block.term.jmps.iter_mut() {
                    match &mut jmp.term {
                        Jmp::Branch(_) | Jmp::Call { .. } | Jmp::CallOther { .. } => (),
                        Jmp::BranchInd(expr)
                        | Jmp::CBranch {
                            condition: expr, ..
                        }
                        | Jmp::CallInd { target: expr, .. }
                        | Jmp::Return(expr) => expr.substitute_trivial_operations(),
                    }
                }
            }
        }
    }

    /// Replaces the return-to TID of calls to non-returning functions with the
    /// TID of the artificial sink block in the caller.
    ///
    /// We distinguish two kinds of non-returning functions:
    ///
    /// - extern symbols that are marked as non-returning, e.g.,`exit(..)`,
    /// - functions without a return instruction.
    ///
    /// For calls to the latter functions, no [`CallReturn`] nodes and
    /// corresponding edges will be generated in the CFG. This implies that no
    /// interprocedural analysis will happen for those calls. Furthermore, the
    /// missing incoming edge to the return node implies that the node may be
    /// optimized away by the [control flow propagation pass]. The reference to
    /// the return site is now "dangling" and may lead to panics when
    /// constructing the CFG (Issue #461).
    ///
    /// Thus, we lose nothing if we retarget the return block, even if our
    /// analysis is incorrect and the callee in fact returns to the originally
    /// indicated site. Cases where we misclassify a callee include:
    ///
    /// - functions ending in an indirect tail jump,
    /// - epilogs like `lrd pc, [sp], #0x04` that are essentially a ret but
    ///   Ghidra sometimes thinks its an indirect jump,
    /// - cases where the callee code that we get from Gidra is incomplete.
    ///
    /// This heuristic works better when the block-to-sub mapping is unique
    /// since this pass may inline return site into callees that end in a tail
    /// jump, i.e., call this after [`make_block_to_sub_mapping_unique`].
    /// This pass preserves a unique block-to-sub mapping.
    ///
    /// [`CallReturn`]: crate::analysis::graph::Node::CallReturn
    /// [control flow propagation pass]: mod@propagate_control_flow
    #[must_use]
    fn retarget_non_returning_calls_to_artificial_sink(&mut self) -> Vec<LogMessage> {
        let (non_returning_subs, mut log_messages) = self.find_non_returning_subs();

        // INVARIANT: A unique block-to-sub mapping is preserved.
        for sub in self
            .program
            .term
            .subs
            .values_mut()
            .filter(|sub| !sub.tid.is_artificial_sink_sub())
        {
            let sub_id_suffix = sub.id_suffix();
            let mut one_or_more_call_retargeted = false;

            for block in sub.term.blocks.iter_mut() {
                for jmp in block.term.jmps.iter_mut() {
                    let Jmp::Call {
                        target,
                        return_: Some(return_tid),
                    } = &mut jmp.term
                    else {
                        continue;
                    };

                    if return_tid.is_artificial_sink_block(&sub_id_suffix) {
                        // The call is already returning to the function's
                        // artificial sink so there is nothing to do.
                        continue;
                    } else if let Some(extern_symbol) = self.program.term.extern_symbols.get(target)
                    {
                        if extern_symbol.no_return {
                            // Reroute returns from calls to non-returning
                            // library functions.
                            *return_tid = Tid::artificial_sink_block(&sub_id_suffix);

                            one_or_more_call_retargeted = true;
                        }
                    } else if non_returning_subs.contains(target) {
                        // Reroute returns from calls to non-returning
                        // functions within the program.
                        log_messages.push(LogMessage::new_info(format!(
                            "Call @ {} to {} does not return to {}.",
                            jmp.tid, target, return_tid
                        )));

                        *return_tid = Tid::artificial_sink_block(&sub_id_suffix);

                        one_or_more_call_retargeted = true;
                    }
                }
            }

            // Add artificial sink block if required.
            if one_or_more_call_retargeted {
                sub.add_artifical_sink();
            }
        }

        log_messages
    }

    /// Returns the set of all subs without a return instruction.
    fn find_non_returning_subs(&self) -> (HashSet<Tid>, Vec<LogMessage>) {
        let mut log_messages = Vec::new();
        let non_returning_subs = self
            .program
            .term
            .subs
            .values()
            .filter_map(|sub| {
                let sub_returns = sub.term.blocks.iter().any(|block| {
                    block
                        .term
                        .jmps
                        .iter()
                        .any(|jmp| matches!(jmp.term, Jmp::Return(..)))
                });

                if sub_returns || sub.tid.is_artificial_sink_sub() {
                    None
                } else {
                    log_messages.push(LogMessage::new_info(format!(
                        "{} is non-returning.",
                        sub.tid
                    )));

                    Some(sub.tid.clone())
                }
            })
            .collect();

        (non_returning_subs, log_messages)
    }

    /// Adds a function that serves as an artificial sink in the CFG.
    fn add_artifical_sink(&mut self) {
        self.program
            .term
            .subs
            .insert(Tid::artificial_sink_sub(), Term::<Sub>::artificial_sink());
    }

    /// Returns the set of all valid jump targets.
    fn find_all_jump_targets(&self) -> HashSet<Tid> {
        let mut jump_target_tids = HashSet::new();

        for sub in self.program.term.subs.values() {
            jump_target_tids.insert(sub.tid.clone());
            for block in sub.term.blocks.iter() {
                jump_target_tids.insert(block.tid.clone());
            }
        }

        for symbol_tid in self.program.term.extern_symbols.keys() {
            jump_target_tids.insert(symbol_tid.clone());
        }

        jump_target_tids
    }

    /// Replace jumps to nonexisting TIDs with jumps to a dummy target
    /// representing an artificial sink in the control flow graph.
    /// Return a log message for each replaced jump target.
    ///
    /// Nonexisting jump targets may be generated by the Ghidra backend
    /// if the data at the target address is not a valid assembly instruction.
    #[must_use]
    fn remove_references_to_nonexisting_tids(&mut self) -> Vec<LogMessage> {
        let mut log_messages = Vec::new();
        let all_jump_targets = self.find_all_jump_targets();

        // Replace all jumps to non-existing jump targets with jumps to dummy
        // targets.
        for sub in self.program.term.subs.values_mut() {
            for block in sub.term.blocks.iter_mut() {
                if let Err(mut logs) =
                    block.remove_nonexisting_indirect_jump_targets(&all_jump_targets)
                {
                    log_messages.append(&mut logs);
                }
                for jmp in block.term.jmps.iter_mut() {
                    if let Err(log_msg) =
                        jmp.retarget_nonexisting_jump_targets_to_artificial_sink(&all_jump_targets)
                    {
                        log_messages.push(log_msg);
                    }
                }
            }
        }

        log_messages
    }

    /// Remove blocks, defs and jumps with duplicate TIDs and return log messages on such cases.
    /// Since such cases break the fundamental invariant that each TID is unique,
    /// they result in errors if not removed.
    ///
    /// Note that each case has a bug as a root cause.
    /// This code is only a workaround so that before the corresponding bug is fixed
    /// the rest of the binary can still be analyzed.
    #[must_use]
    fn remove_duplicate_tids(&mut self) -> Vec<LogMessage> {
        let mut known_tids = HashSet::new();
        let mut errors = Vec::new();
        known_tids.insert(self.program.tid.clone());
        for sub in self.program.term.subs.values_mut() {
            if !known_tids.insert(sub.tid.clone()) {
                panic!("Duplicate of TID {} encountered.", sub.tid);
            }
            let mut filtered_blocks = Vec::new();
            for block in &sub.term.blocks {
                if known_tids.insert(block.tid.clone()) {
                    filtered_blocks.push(block.clone());
                } else {
                    errors.push(LogMessage::new_error(&format!(
                        "Removed duplicate of TID {}. This is a bug in the cwe_checker!",
                        block.tid
                    )));
                }
            }
            sub.term.blocks = filtered_blocks;
            for block in sub.term.blocks.iter_mut() {
                let mut filtered_defs = Vec::new();
                let mut filtered_jmps = Vec::new();
                for def in &block.term.defs {
                    if known_tids.insert(def.tid.clone()) {
                        filtered_defs.push(def.clone());
                    } else {
                        errors.push(LogMessage::new_error(&format!(
                            "Removed duplicate of TID {}. This is a Bug in the cwe_checker!",
                            def.tid
                        )));
                    }
                }
                for jmp in &block.term.jmps {
                    if known_tids.insert(jmp.tid.clone()) {
                        filtered_jmps.push(jmp.clone());
                    } else {
                        errors.push(LogMessage::new_error(&format!(
                            "Removed duplicate of TID {}. This is a Bug in the cwe_checker!",
                            jmp.tid
                        )));
                    }
                }
                block.term.defs = filtered_defs;
                block.term.jmps = filtered_jmps;
            }
        }

        errors
    }

    /// Performs only the normalizations necessary to analyze the project.
    ///
    /// Runs only the normalization passes that bring the project to a form
    /// in which it can be consumed by the later analyses. Currently those are:
    ///
    /// - Removal of duplicate TIDs. (This is a workaround for a bug in the
    ///   P-Code-Extractor and should be removed once the bug is fixed.)
    /// - Replacement of references to nonexisting TIDs with jumps to artificial
    ///   sink targets in the CFG.
    /// - Duplication of blocks so that if a block is contained in several
    ///   functions, each function gets its own unique copy.
    /// - Replacement of return addresses for calls to non-returning functions
    ///   with artificial sink targets.
    ///
    /// After those passes all of the later analyses can be computed. However,
    /// they are expected to run faster if you also run
    /// [`Project::normalize_optimize`] beforehand.
    #[must_use]
    pub fn normalize_basic(&mut self) -> Vec<LogMessage> {
        let mut logs = self.remove_duplicate_tids();
        self.add_artifical_sink();
        logs.append(self.remove_references_to_nonexisting_tids().as_mut());
        make_block_to_sub_mapping_unique(self);
        logs.append(
            self.retarget_non_returning_calls_to_artificial_sink()
                .as_mut(),
        );

        logs
    }

    /// Performs only the optimizing normalization passes.
    ///
    /// [`Project::normalize_basic`] **must** be called before this method.
    ///
    /// Runs only the optimization passes that transform the program to an
    /// equivalent, simpler representation. This step is exprected to improve
    /// the speed and precision of later analyses.
    ///
    /// Currently, the following optimizations are performed:
    ///
    /// - Propagate input expressions along variable assignments.
    /// - Replace trivial expressions like `a XOR a` with their result.
    /// - Remove dead register assignments.
    /// - Propagate the control flow along chains of conditionals with the same condition.
    /// - Substitute bitwise `AND` and `OR` operations with the stack pointer
    ///   in cases where the result is known due to known stack pointer alignment.
    #[must_use]
    pub fn normalize_optimize(&mut self) -> Vec<LogMessage> {
        analysis::expression_propagation::propagate_input_expression(self);
        self.substitute_trivial_expressions();
        analysis::dead_variable_elimination::remove_dead_var_assignments(self);
        propagate_control_flow(self);
        analysis::stack_alignment_substitution::substitute_and_on_stackpointer(self)
            .unwrap_or_default()
    }

    /// Run all normalization passes over the project.
    ///
    /// Convenience wrapper that calls [`Project::normalize_basic`] and
    /// [`Project::normalize_optimize`].
    #[must_use]
    pub fn normalize(&mut self) -> Vec<LogMessage> {
        let mut logs = self.normalize_basic();
        logs.append(self.normalize_optimize().as_mut());

        logs
    }
}

impl Term<Jmp> {
    /// If the TID of a jump target, call target, or return target is not
    /// contained in in the known jump targets, replace it with a dummy TID and
    /// return an error message.
    fn retarget_nonexisting_jump_targets_to_artificial_sink(
        &mut self,
        all_jump_targets: &HashSet<Tid>,
    ) -> Result<(), LogMessage> {
        use Jmp::*;
        match &mut self.term {
            BranchInd(_) => Ok(()),
            Branch(tid) | CBranch { target: tid, .. } if !all_jump_targets.contains(tid) => {
                let error_msg = format!("Jump target at {} does not exist", tid.address);
                let error_log = LogMessage::new_error(error_msg).location(self.tid.clone());

                *tid = Tid::artificial_sink_block("");

                Err(error_log)
            }
            Call { target, return_ } if !all_jump_targets.contains(target) => {
                let error_msg = format!("Call target at {} does not exist", target.address);
                let error_log = LogMessage::new_error(error_msg).location(self.tid.clone());

                *target = Tid::artificial_sink_sub();
                *return_ = None;

                Err(error_log)
            }
            Call {
                return_: Some(return_tid),
                ..
            }
            | CallInd {
                return_: Some(return_tid),
                ..
            }
            | CallOther {
                return_: Some(return_tid),
                ..
            } if !all_jump_targets.contains(return_tid) => {
                let error_msg = format!("Return target at {} does not exist", return_tid.address);
                let error_log = LogMessage::new_error(error_msg).location(self.tid.clone());

                *return_tid = Tid::artificial_sink_block("");

                Err(error_log)
            }
            _ => Ok(()),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn retarget_nonexisting_jumps() {
        let mut jmp_term = Term {
            tid: Tid::new("jmp"),
            term: Jmp::Branch(Tid::new("nonexisting_target")),
        };
        assert_eq!(jmp_term.term, Jmp::Branch(Tid::new("nonexisting_target")));
        assert!(jmp_term
            .retarget_nonexisting_jump_targets_to_artificial_sink(&HashSet::new(),)
            .is_err());
        assert_eq!(jmp_term.term, Jmp::Branch(Tid::artificial_sink_block("")));
    }
}