Skip to content

Commit

Permalink
osemgrep: Port autofix application for fix (semgrep#6398)
Browse files Browse the repository at this point in the history
This isn't an exact port of the relevant code in `autofix.py`. I already
had some autofix application code in `Autofix.ml` for testing purposes.
In semgrep#6391 I refactored it to make it usable in this context.

Several differences:
- Instead of tracking details about applied fixes so that we can
  correctly apply multiple fixes to the same file, we just apply them
  from bottom to top so that they can't interfere with each other.
- We discard overlapping fixes and apply the ones that we can apply
  without conflicts. I think the Python CLI just applies them anyway,
  possibly incorrectly. However, the Python CLI also dedups matches that
  are the same except for metavariable bindings, which eliminates a
  major source of overlapping fixes. This logic is not ported yet.
- Here, we apply fixes using the byte offset into the file, instead of
  reading the file, splitting into lines, and then updating using the
  line and column like the CLI does. This is a lot simpler.
- When the `--dryrun` flag is passed, the Python CLI reports a list of
  fixed lines in its JSON output. This hasn't been ported yet, and in
  fact using the byte offset (see above) will make this a little bit
  more complicated.

Test plan:

Manual testing with the example given in semgrep#4964:

`osemgrep scan -c autofix.yaml autofix.py --autofix`

Note that this is an example where there are multiple matches reported
for the same range, due to different possible metavariable bindings. For
overlapping ranges, osemgrep applies the first one in the list that
semgrep-core returns, just like the Python CLI. So, in this example it
applies the same fix as the CLI.

Also, ran the e2e tests:

`PYTEST_USE_OSEMGREP=true pipenv run python -m pytest tests/e2e`

Before: 77 passed

After: 83 passed
  • Loading branch information
nmote authored Oct 26, 2022
1 parent 91433aa commit 338b35e
Show file tree
Hide file tree
Showing 10 changed files with 120 additions and 59 deletions.
44 changes: 43 additions & 1 deletion semgrep-core/src/fixing/Autofix.ml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ let validate_fix lang text =
(Exception.to_string e))

type textedit = {
path : string;
path : Common.filename;
(* 0-based byte index, inclusive *)
start : int;
(* 0-based byte index, exclusive *)
Expand Down Expand Up @@ -94,6 +94,28 @@ let apply_edits_to_text text edits =
if discarded_edits = [] then Success fixed_text
else Overlap { partial_result = fixed_text; discarded_edits }

let partition_edits_by_file edits =
(* TODO Consider using Common.group_by if we update it to return edits in
* order and add a comment specifying that changes to it must maintain that
* behavior. *)
let edits_by_file = Hashtbl.create 8 in
List.iter
(fun edit ->
let prev =
match Hashtbl.find_opt edits_by_file edit.path with
| Some lst -> lst
| None -> []
in
Hashtbl.replace edits_by_file edit.path (edit :: prev))
edits;
(* Restore the original order of the edits as they appeared in the input list.
* This is important so that we consistently choose to apply the first edit in
* the original input when there are edits for an identical span. *)
Hashtbl.filter_map_inplace
(fun _file edits -> Some (List.rev edits))
edits_by_file;
edits_by_file

(******************************************************************************)
(* Entry Points *)
(******************************************************************************)
Expand Down Expand Up @@ -157,6 +179,26 @@ let render_fix lang metavars ~fix_pattern ~target_contents =
|> List.iter (fun line -> logger#info "%s" line);
None

let apply_edits ~dryrun edits =
let edits_by_file = partition_edits_by_file edits in
let all_discarded_edits = ref [] in
Hashtbl.iter
(fun file file_edits ->
let file_text = Common.read_file file in
let new_text =
match apply_edits_to_text file_text file_edits with
| Success x -> x
| Overlap { partial_result; discarded_edits } ->
Common.push discarded_edits all_discarded_edits;
partial_result
in
(* TOPORT: when dryrun, report fixed lines *)
if not dryrun then Common.write_file ~file new_text)
edits_by_file;
let modified_files = Hashtbl.to_seq_keys edits_by_file |> List.of_seq in
let discarded_edits = List.concat !all_discarded_edits in
(modified_files, discarded_edits)

(* Apply the fix for the list of matches to the given file, returning the
* resulting file contents. Currently used only for tests, but with some changes
* could be used in production as well. *)
Expand Down
16 changes: 16 additions & 0 deletions semgrep-core/src/fixing/Autofix.mli
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,22 @@ val render_fix :
target_contents:string Lazy.t ->
string option

type textedit = {
path : string;
(* 0-based byte index, inclusive *)
start : int;
(* 0-based byte index, exclusive *)
end_ : int;
replacement_text : string;
}

(* Apply a list of edits, modifying the files in place. If dryrun, do everything
* but write to the files.
*
* Returns the list of modified files and the list of edits that were not
* applied because they overlapped with others. *)
val apply_edits : dryrun:bool -> textedit list -> string list * textedit list

(* Apply the fix for the list of matches to the given file, returning the
* resulting file contents. Currently used only for tests, but with some changes
* could be used in production as well. *)
Expand Down
30 changes: 0 additions & 30 deletions semgrep-core/src/osemgrep/TOPORT/autofix.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,36 +64,6 @@ def _get_match_context(
return start_line, start_col, end_line, end_col


def _basic_fix(
rule_match: RuleMatch, file_offsets: FileOffsets, fix: str
) -> Tuple[Fix, FileOffsets]:

p = rule_match.path
lines = _get_lines(p)

# get the start and end points
start_line, start_col, end_line, end_col = _get_match_context(
rule_match, file_offsets
)

# break into before, to modify, after
before_lines = lines[:start_line]
before_on_start_line = lines[start_line][:start_col]
after_on_end_line = lines[end_line][end_col:] # next char after end of match
modified_lines = (before_on_start_line + fix + after_on_end_line).splitlines()
after_lines = lines[end_line + 1 :] # next line after end of match
contents_after_fix = before_lines + modified_lines + after_lines

# update offsets
file_offsets.line_offset = (
len(contents_after_fix) - len(lines) + file_offsets.line_offset
)
if start_line == end_line:
file_offsets.col_offset = len(fix) - (end_col - start_col)

return Fix(SPLIT_CHAR.join(contents_after_fix), modified_lines), file_offsets


def _regex_replace(
rule_match: RuleMatch,
file_offsets: FileOffsets,
Expand Down
10 changes: 0 additions & 10 deletions semgrep-core/src/osemgrep/TOPORT/commands/scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,16 +271,6 @@ def scan_options(func: Callable) -> Callable:

@optgroup.group("Configuration options", cls=MutuallyExclusiveOptionGroup)

@click.option(
"--dryrun/--no-dryrun",
is_flag=True,
default=False,
help="""
If --dryrun, does not write autofixes to a file. This will print the changes
to the console. This lets you see the changes before you commit to them. Only
works with the --autofix flag. Otherwise does nothing.
""",
)
@click.option(
"--severity",
multiple=True,
Expand Down
9 changes: 0 additions & 9 deletions semgrep-core/src/osemgrep/TOPORT/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,15 +156,6 @@ def flatten(some_list: List[List[T]]) -> List[T]:
PathFilterCallable = Callable[..., FrozenSet[Path]]


def unit_str(count: int, unit: str, pad: bool = False) -> str:
if count != 1:
unit += "s"
elif pad:
unit += " "

return f"{count} {unit}"


def git_check_output(command: Sequence[str], cwd: Optional[str] = None) -> str:
"""
Helper function to run a GIT command that prints out helpful debugging information
Expand Down
1 change: 1 addition & 0 deletions semgrep-core/src/osemgrep/cli_scan/Core_runner.ml
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ let runner_config_of_conf (conf : Scan_CLI.conf) : Runner_config.t =
(* TOPORT: not handled yet *)
logging_level = _;
autofix = _;
dryrun = _;
baseline_commit = _;
exclude = _;
include_ = _;
Expand Down
33 changes: 33 additions & 0 deletions semgrep-core/src/osemgrep/cli_scan/Output.ml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,20 @@ module Out = Semgrep_output_v0_j
TODO? move most of the content of this file to Output_JSON.ml?
*)

let apply_fixes (conf : Scan_CLI.conf) (cli_output : Out.cli_output) =
(* TODO fix_regex *)
let edits : Autofix.textedit list =
List.filter_map
(fun (result : Out.cli_match) ->
let path = result.Out.path in
let* fix = result.Out.extra.fix in
let start = result.Out.start.offset in
let end_ = result.Out.end_.offset in
Some Autofix.{ path; start; end_; replacement_text = fix })
cli_output.results
in
Autofix.apply_edits ~dryrun:conf.dryrun edits

(*****************************************************************************)
(* Entry point *)
(*****************************************************************************)
Expand All @@ -21,6 +35,25 @@ let output_result (conf : Scan_CLI.conf) (res : Core_runner.result) : unit =
let cli_output : Out.cli_output =
Cli_json_output.cli_output_of_core_results conf res
in
(if conf.autofix then
(* TODO report when we fail to apply a fix because it overlaps with another?
* Currently it looks like the Python CLI will just blindly apply
* overlapping fixes, probably breaking code.
*
* At some point we could re-run Semgrep on all files where some fixes
* haven't been applied because they overlap with other fixes, and repeat
* until all are applied (with some bounds to prevent divergence). This
* probably happens rarely enough that it would be very fast, and it would
* make the autofix experience better.
* *)
let modified_files, _failed_fixes = apply_fixes conf cli_output in
if not conf.dryrun then
if modified_files <> [] then
pr2
(spf "successfully modified %s."
(String_utils.unit_str (List.length modified_files) "file"))
else pr2 "no files modified.");

(* TOPORT? Sort keys for predictable output. Helps with snapshot tests *)
match conf.output_format with
| Json ->
Expand Down
30 changes: 21 additions & 9 deletions semgrep-core/src/osemgrep/cli_scan/Scan_CLI.ml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ module H = Cmdliner_helpers
*)
type conf = {
autofix : bool;
dryrun : bool;
baseline_commit : string option;
(* TOPORT: can have multiple calls to --config, so string list here *)
config : string;
Expand Down Expand Up @@ -55,6 +56,7 @@ let get_cpu_count () : int =
let default : conf =
{
autofix = false;
dryrun = false;
baseline_commit = None;
config = "auto";
exclude = [];
Expand Down Expand Up @@ -101,6 +103,15 @@ Make sure your files are stored in a version control system. Note that
this mode is experimental and not guaranteed to function properly.
|}

let o_dryrun : bool Term.t =
H.negatable_flag [ "dryrun" ] ~neg_options:[ "no-dryrun" ]
~default:default.dryrun
~doc:
{| If --dryrun, does not write autofixes to a file. This will print the changes
to the console. This lets you see the changes before you commit to them. Only
works with the --autofix flag. Otherwise does nothing.
|}

let o_baseline_commit : string option Term.t =
let info =
Arg.info [ "baseline_commit" ]
Expand Down Expand Up @@ -390,10 +401,10 @@ let o_target_roots =
(*****************************************************************************)

let cmdline_term : conf Term.t =
let combine autofix baseline_commit config debug emacs exclude include_ json
lang max_memory_mb max_target_bytes metrics num_jobs optimizations pattern
quiet respect_git_ignore strict target_roots timeout timeout_threshold
verbose vim =
let combine autofix dryrun baseline_commit config debug emacs exclude include_
json lang max_memory_mb max_target_bytes metrics num_jobs optimizations
pattern quiet respect_git_ignore strict target_roots timeout
timeout_threshold verbose vim =
let output_format =
match (json, emacs, vim) with
| false, false, false -> default.output_format
Expand All @@ -414,6 +425,7 @@ let cmdline_term : conf Term.t =
in
{
autofix;
dryrun;
baseline_commit;
config;
exclude;
Expand All @@ -436,11 +448,11 @@ let cmdline_term : conf Term.t =
in
(* Term defines 'const' but also the '$' operator *)
Term.(
const combine $ o_autofix $ o_baseline_commit $ o_config $ o_debug $ o_emacs
$ o_exclude $ o_include $ o_json $ o_lang $ o_max_memory_mb
$ o_max_target_bytes $ o_metrics $ o_num_jobs $ o_optimizations $ o_pattern
$ o_quiet $ o_respect_git_ignore $ o_strict $ o_target_roots $ o_timeout
$ o_timeout_threshold $ o_verbose $ o_vim)
const combine $ o_autofix $ o_dryrun $ o_baseline_commit $ o_config
$ o_debug $ o_emacs $ o_exclude $ o_include $ o_json $ o_lang
$ o_max_memory_mb $ o_max_target_bytes $ o_metrics $ o_num_jobs
$ o_optimizations $ o_pattern $ o_quiet $ o_respect_git_ignore $ o_strict
$ o_target_roots $ o_timeout $ o_timeout_threshold $ o_verbose $ o_vim)

let doc = "run semgrep rules on files"

Expand Down
1 change: 1 addition & 0 deletions semgrep-core/src/osemgrep/cli_scan/Scan_CLI.mli
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
*)
type conf = {
autofix : bool;
dryrun : bool;
baseline_commit : string option;
config : string;
exclude : string list;
Expand Down
5 changes: 5 additions & 0 deletions semgrep-core/src/utils/String_utils.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
open Common

let unit_str ?(pad = false) count str =
let str = if count <> 1 then str ^ "s" else if pad then str ^ " " else str in
spf "%d %s" count str

0 comments on commit 338b35e

Please sign in to comment.