Skip to content

Commit

Permalink
chore(osemgrep): send all rules and targets together to Run_semgrep (s…
Browse files Browse the repository at this point in the history
…emgrep#8197)

The run was previously broken up by language, but this is ugly and
unnecessary. I'm refactoring this ahead of changes to implement
osemgrep-pro, which will be easier if this is one command.

(I could also have wrapped it in a function but it wasn't too hard to
fix.)

Test plan: make test

PR checklist:

- [x] Purpose of the code is [evident to future
readers](https://semgrep.dev/docs/contributing/contributing-code/#explaining-code)
- [x] Tests included or PR comment includes a reproducible test plan
- [x] Documentation is up-to-date
- [x] A changelog entry was [added to
changelog.d](https://semgrep.dev/docs/contributing/contributing-code/#adding-a-changelog-entry)
for any user-facing change
- [x] Change has no security implications (otherwise, ping security
team)

If you're unsure about any of this, please see:

- [Contribution
guidelines](https://semgrep.dev/docs/contributing/contributing-code)!
- [One of the more specific guides located
here](https://semgrep.dev/docs/contributing/contributing/)
  • Loading branch information
emjin authored Jul 10, 2023
1 parent 521eba0 commit c2a55bc
Showing 1 changed file with 62 additions and 79 deletions.
141 changes: 62 additions & 79 deletions src/osemgrep/cli_scan/Core_runner.ml
Original file line number Diff line number Diff line change
Expand Up @@ -57,41 +57,6 @@ type result = {
(* Helpers *)
(*************************************************************************)

(* TODO: should return also exn opt,
* like the return type of semgrep_with_raw_results_and_exn_handler.
* TODO: we should not even need this function because
* Run_semgrep.semgrep_with_raw_results_and_exn_handler can already
* handle a list of targets in different language and so no need to
* merge results in the first place.
*
* TODO? do we have utility functions like that already in Report.mli?
* should move it there? or should not need it at all, see TODO above?
*)
let merge_results (xresults : (Report.final_result * Fpath.t Set_.t) list) :
Report.final_result * Fpath.t Set_.t =
let results = xresults |> Common.map fst in
let files =
xresults |> Common.map snd |> List.fold_left Set_.union Set_.empty
in
let final_result =
{
RP.matches = List.concat_map (fun x -> x.RP.matches) results;
errors = List.concat_map (fun x -> x.RP.errors) results;
skipped_rules = List.concat_map (fun x -> x.RP.skipped_rules) results;
extra = No_info;
explanations = List.concat_map (fun x -> x.RP.explanations) results;
rules_by_engine =
List.concat_map (fun x -> x.Report.rules_by_engine) results;
}
in
(* TODO(dinosaure): currently, we don't collect metrics when we invoke
semgrep-core but we should. However, if we implement a way to collect
metrics, we will just need to set [final_result.extra] to
[RP.Debug]/[RP.Time] and this line of code will not change. *)
Metrics_.add_max_memory_bytes (RP.debug_info_to_option final_result.extra);
Metrics_.add_targets files (RP.debug_info_to_option final_result.extra);
(final_result, files)

(* The same rule may appear under multiple target languages because
some patterns can be interpreted in multiple languages.
TODO? could use Common.group_by
Expand Down Expand Up @@ -157,44 +122,56 @@ let runner_config_of_conf (conf : conf) : Runner_config.t =
version = Version.version;
}

(* Similar to semgrep_with_raw_results_and_exn_handler but takes rules
and targets already filtered for a specific language.
All other options are read from the runner_config object.
TODO: we should not need this function because
semgrep_with_raw_results_and_exn_handler can take a list of targets
in different language (via config.targets set via --targets)
This is ugly, with potentially some filtering operations being done twice.
It should get simplified when we get rid of the pysemgrep completely.
*)
let semgrep_with_prepared_rules_and_targets (config : Runner_config.t)
(x : Lang_job.t) : Exception.t option * Report.final_result * Fpath.t list =
(* compute the rule idx and rule_nums for target_mappings
* (see Input_to_core.atd)
*)
let rule_ids =
x.rules
|> Common.map (fun (x : Rule.t) ->
let id, _tok = x.id in
(id :> string))
let prepare_config_for_semgrep_core (config : Runner_config.t)
(lang_jobs : Lang_job.t list) =
let target_mappings_of_lang_job (x : Lang_job.t) prev_rule_count :
int * Input_to_core_t.target list * Rule.rules =
let rule_ids =
x.rules
|> Common.map (fun (x : Rule.t) ->
let id, _tok = x.id in
(id :> string))
in
let rule_nums = rule_ids |> Common.mapi (fun i _ -> i + prev_rule_count) in
let target_mappings =
x.targets
|> Common.map (fun (path : Fpath.t) : Input_to_core_t.target ->
{ path = !!path; language = x.xlang; rule_nums })
in
(List.length rule_ids, target_mappings, x.rules)
in
let rule_nums = rule_ids |> Common.mapi (fun i _ -> i) in
let target_mappings =
x.targets
|> Common.map (fun (path : Fpath.t) : Input_to_core_t.target ->
{ path = !!path; language = x.xlang; rule_nums })
in
let wrapped_targets : Input_to_core_t.targets =
{ target_mappings; rule_ids }
(* The targets are mapped to rule_nums rather than rule_ids to improve the memory usage.
A list of rule_ids is passed with the targets to map back from num -> id. This means
that when creating the targets structure, the rules need to be numbered against the
final rule_ids list.
The rules need to be reversed to number them correctly because of how :: behaves
TODO after we delete pysemgrep, we can simplify this interface, which will also
improve memory usage again *)
let _, target_mappings, rules =
lang_jobs
|> List.fold_left
(fun (n, acc_mappings, acc_rules) lang_job ->
let num_rules, mappings, rule_ids =
target_mappings_of_lang_job lang_job n
in
( n + num_rules,
mappings :: acc_mappings,
List.rev rule_ids :: acc_rules ))
(0, [], [])
in
let config =
{
config with
target_source = Some (Targets wrapped_targets);
rule_source = Some (Rules x.rules);
}
let target_mappings = List.concat target_mappings in
let rules = rules |> List.rev |> List.concat in
let rule_ids =
Common.map (fun r -> fst r.Rule.id |> Rule.ID.to_string) rules
in
(* !!!!Finally! this is where we branch to semgrep-core!!! *)
Run_semgrep.semgrep_with_raw_results_and_exn_handler config
let targets : Input_to_core_t.targets = { target_mappings; rule_ids } in
{
config with
target_source = Some (Targets targets);
rule_source = Some (Rules rules);
}

(*************************************************************************)
(* Entry point *)
Expand Down Expand Up @@ -267,16 +244,22 @@ let invoke_semgrep_core ?(respect_git_ignore = true)
(fun { Lang_job.xlang; _ } ->
Metrics_.add_feature "language" (Xlang.to_string xlang))
lang_jobs;
(* TODO progress bar *)
let results_by_language =
lang_jobs
|> Common.map (fun lang_job ->
let _exn_optTODO, report, files =
semgrep_with_prepared_rules_and_targets config lang_job
in
(report, Set_.of_list files))
let config = prepare_config_for_semgrep_core config lang_jobs in

(* !!!!Finally! this is where we branch to semgrep-core!!! *)
let _exn_opt_TODO, res, files =
Run_semgrep.semgrep_with_raw_results_and_exn_handler config
in
let res, scanned = merge_results results_by_language in

let scanned = Set_.of_list files in

(* TODO(dinosaure): currently, we don't collect metrics when we invoke
semgrep-core but we should. However, if we implement a way to collect
metrics, we will just need to set [final_result.extra] to
[RP.Debug]/[RP.Time] and this line of code will not change. *)
Metrics_.add_max_memory_bytes (RP.debug_info_to_option res.extra);
Metrics_.add_targets scanned (RP.debug_info_to_option res.extra);

(* TODO: should get this from Run_semgrep *)
let _exnTODO = None in
(* similar to Run_semgrep.semgrep_with_rules_and_formatted_output *)
Expand Down

0 comments on commit c2a55bc

Please sign in to comment.