chore(osemgrep): send all rules and targets together to Run_semgrep (s…

…emgrep#8197) The run was previously broken up by language, but this is ugly and unnecessary. I'm refactoring this ahead of changes to implement osemgrep-pro, which will be easier if this is one command. (I could also have wrapped it in a function but it wasn't too hard to fix.) Test plan: make test PR checklist: - [x] Purpose of the code is [evident to future readers](https://semgrep.dev/docs/contributing/contributing-code/#explaining-code) - [x] Tests included or PR comment includes a reproducible test plan - [x] Documentation is up-to-date - [x] A changelog entry was [added to changelog.d](https://semgrep.dev/docs/contributing/contributing-code/#adding-a-changelog-entry) for any user-facing change - [x] Change has no security implications (otherwise, ping security team) If you're unsure about any of this, please see: - [Contribution guidelines](https://semgrep.dev/docs/contributing/contributing-code)! - [One of the more specific guides located here](https://semgrep.dev/docs/contributing/contributing/)
rkialashaki · Jul 10, 2023 · c2a55bc · c2a55bc
1 parent 521eba0
commit c2a55bc
Showing 1 changed file with 62 additions and 79 deletions.
diff --git a/src/osemgrep/cli_scan/Core_runner.ml b/src/osemgrep/cli_scan/Core_runner.ml
@@ -57,41 +57,6 @@ type result = {
 (* Helpers *)
 (*************************************************************************)
 
-(* TODO: should return also exn opt,
- * like the return type of semgrep_with_raw_results_and_exn_handler.
- * TODO: we should not even need this function because
- * Run_semgrep.semgrep_with_raw_results_and_exn_handler can already
- * handle a list of targets in different language and so no need to
- * merge results in the first place.
- *
- * TODO? do we have utility functions like that already in Report.mli?
- * should move it there? or should not need it at all, see TODO above?
- *)
-let merge_results (xresults : (Report.final_result * Fpath.t Set_.t) list) :
-    Report.final_result * Fpath.t Set_.t =
-  let results = xresults |> Common.map fst in
-  let files =
-    xresults |> Common.map snd |> List.fold_left Set_.union Set_.empty
-  in
-  let final_result =
-    {
-      RP.matches = List.concat_map (fun x -> x.RP.matches) results;
-      errors = List.concat_map (fun x -> x.RP.errors) results;
-      skipped_rules = List.concat_map (fun x -> x.RP.skipped_rules) results;
-      extra = No_info;
-      explanations = List.concat_map (fun x -> x.RP.explanations) results;
-      rules_by_engine =
-        List.concat_map (fun x -> x.Report.rules_by_engine) results;
-    }
-  in
-  (* TODO(dinosaure): currently, we don't collect metrics when we invoke
-     semgrep-core but we should. However, if we implement a way to collect
-     metrics, we will just need to set [final_result.extra] to
-     [RP.Debug]/[RP.Time] and this line of code will not change. *)
-  Metrics_.add_max_memory_bytes (RP.debug_info_to_option final_result.extra);
-  Metrics_.add_targets files (RP.debug_info_to_option final_result.extra);
-  (final_result, files)
-
 (* The same rule may appear under multiple target languages because
    some patterns can be interpreted in multiple languages.
    TODO? could use Common.group_by
@@ -157,44 +122,56 @@ let runner_config_of_conf (conf : conf) : Runner_config.t =
         version = Version.version;
       }
 
-(* Similar to semgrep_with_raw_results_and_exn_handler but takes rules
-   and targets already filtered for a specific language.
-   All other options are read from the runner_config object.
-   TODO: we should not need this function because
-   semgrep_with_raw_results_and_exn_handler can take a list of targets
-   in different language (via config.targets set via --targets)
-   This is ugly, with potentially some filtering operations being done twice.
-   It should get simplified when we get rid of the pysemgrep completely.
-*)
-let semgrep_with_prepared_rules_and_targets (config : Runner_config.t)
-    (x : Lang_job.t) : Exception.t option * Report.final_result * Fpath.t list =
-  (* compute the rule idx and rule_nums for target_mappings
-   * (see Input_to_core.atd)
-   *)
-  let rule_ids =
-    x.rules
-    |> Common.map (fun (x : Rule.t) ->
-           let id, _tok = x.id in
-           (id :> string))
+let prepare_config_for_semgrep_core (config : Runner_config.t)
+    (lang_jobs : Lang_job.t list) =
+  let target_mappings_of_lang_job (x : Lang_job.t) prev_rule_count :
+      int * Input_to_core_t.target list * Rule.rules =
+    let rule_ids =
+      x.rules
+      |> Common.map (fun (x : Rule.t) ->
+             let id, _tok = x.id in
+             (id :> string))
+    in
+    let rule_nums = rule_ids |> Common.mapi (fun i _ -> i + prev_rule_count) in
+    let target_mappings =
+      x.targets
+      |> Common.map (fun (path : Fpath.t) : Input_to_core_t.target ->
+             { path = !!path; language = x.xlang; rule_nums })
+    in
+    (List.length rule_ids, target_mappings, x.rules)
   in
-  let rule_nums = rule_ids |> Common.mapi (fun i _ -> i) in
-  let target_mappings =
-    x.targets
-    |> Common.map (fun (path : Fpath.t) : Input_to_core_t.target ->
-           { path = !!path; language = x.xlang; rule_nums })
-  in
-  let wrapped_targets : Input_to_core_t.targets =
-    { target_mappings; rule_ids }
+  (* The targets are mapped to rule_nums rather than rule_ids to improve the memory usage.
+     A list of rule_ids is passed with the targets to map back from num -> id. This means
+     that when creating the targets structure, the rules need to be numbered against the
+     final rule_ids list.
+
+     The rules need to be reversed to number them correctly because of how :: behaves
+
+     TODO after we delete pysemgrep, we can simplify this interface, which will also
+     improve memory usage again *)
+  let _, target_mappings, rules =
+    lang_jobs
+    |> List.fold_left
+         (fun (n, acc_mappings, acc_rules) lang_job ->
+           let num_rules, mappings, rule_ids =
+             target_mappings_of_lang_job lang_job n
+           in
+           ( n + num_rules,
+             mappings :: acc_mappings,
+             List.rev rule_ids :: acc_rules ))
+         (0, [], [])
   in
-  let config =
-    {
-      config with
-      target_source = Some (Targets wrapped_targets);
-      rule_source = Some (Rules x.rules);
-    }
+  let target_mappings = List.concat target_mappings in
+  let rules = rules |> List.rev |> List.concat in
+  let rule_ids =
+    Common.map (fun r -> fst r.Rule.id |> Rule.ID.to_string) rules
   in
-  (* !!!!Finally! this is where we branch to semgrep-core!!! *)
-  Run_semgrep.semgrep_with_raw_results_and_exn_handler config
+  let targets : Input_to_core_t.targets = { target_mappings; rule_ids } in
+  {
+    config with
+    target_source = Some (Targets targets);
+    rule_source = Some (Rules rules);
+  }
 
 (*************************************************************************)
 (* Entry point *)
@@ -267,16 +244,22 @@ let invoke_semgrep_core ?(respect_git_ignore = true)
         (fun { Lang_job.xlang; _ } ->
           Metrics_.add_feature "language" (Xlang.to_string xlang))
         lang_jobs;
-      (* TODO progress bar *)
-      let results_by_language =
-        lang_jobs
-        |> Common.map (fun lang_job ->
-               let _exn_optTODO, report, files =
-                 semgrep_with_prepared_rules_and_targets config lang_job
-               in
-               (report, Set_.of_list files))
+      let config = prepare_config_for_semgrep_core config lang_jobs in
+
+      (* !!!!Finally! this is where we branch to semgrep-core!!! *)
+      let _exn_opt_TODO, res, files =
+        Run_semgrep.semgrep_with_raw_results_and_exn_handler config
       in
-      let res, scanned = merge_results results_by_language in
+
+      let scanned = Set_.of_list files in
+
+      (* TODO(dinosaure): currently, we don't collect metrics when we invoke
+         semgrep-core but we should. However, if we implement a way to collect
+         metrics, we will just need to set [final_result.extra] to
+         [RP.Debug]/[RP.Time] and this line of code will not change. *)
+      Metrics_.add_max_memory_bytes (RP.debug_info_to_option res.extra);
+      Metrics_.add_targets scanned (RP.debug_info_to_option res.extra);
+
       (* TODO: should get this from Run_semgrep *)
       let _exnTODO = None in
       (* similar to Run_semgrep.semgrep_with_rules_and_formatted_output *)