forked from awsdocs/aws-doc-sdk-examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
cleanup_report.py
256 lines (225 loc) · 9.46 KB
/
cleanup_report.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
"""
Reads cleanup metadata and writes a report of files cleaned up vs. files still
needing cleanup. A cleaned file contains code that has been brought up to coding
standard, has been tested, and has at least minimal comments. To include a file
in the cleaned report, list it in a metadata.yaml file somewhere in the repo.
How to run a report
To run a full report over the entire repo, in the base folder of your repo,
run the following command.
python -m scripts.cleanup_report
This scans the full repository for files named 'metadata.yaml', digests them
into a list of cleaned files that are compared against existing code files,
and writes a report to a file named 'report.csv' in the same folder.
You can also run the script against a subfolder and output the report to a custom
location, which can be useful for testing new metadata files.
python -m scripts.cleanup_report --root python/example_code/sqs --report ~/temp/sqs_rep.csv
"""
import argparse
import os
import sys
from urllib.parse import urljoin
import urllib.request as request
import yaml
from yaml.scanner import ScannerError
from yaml.parser import ParserError
# File names (compared lowercased) that identify a cleanup metadata file.
METADATA_FILENAMES = ['metadata.yaml', '.metadata.yaml']
# Base URL used to build links to files in the public GitHub repo.
GITHUB_URL = 'https://github.com/awsdocs/aws-doc-sdk-examples/tree/master/'
# A file must have one of these extensions to be counted in file total.
# Maps lowercase extension -> language name used in the CSV report.
# NOTE(review): 'html' mapping to 'JavaScript' looks deliberate (browser
# examples), but confirm against the report consumers.
EXT_LOOKUP = {
    'c': 'C',
    'cpp': 'C++',
    'cs': 'C#',
    'go': 'Go',
    'html': 'JavaScript',
    'java': 'Java',
    'js': 'JavaScript',
    'php': 'PHP',
    'py': 'Python',
    'rb': 'Ruby',
    'ts': 'TypeScript'
}
# Folder names pruned from the os.walk scan (build output, caches, etc.).
IGNORE_FOLDERS = {
    'venv',
    'scripts',
    '__pycache__',
    '.pytest_cache',
    '.vscode',
    'vendor',
    'node_modules',
}
def make_github_url(folder_name, file_name):
    """
    Build the full GitHub URL for a file by joining the GitHub base URL,
    the relative path to the file's folder, and the file name.

    :param folder_name: The relative path to the folder that contains the file.
    :param file_name: The name of the file.
    :return: The full URL to the file on GitHub.
    """
    # pathname2url converts OS-specific separators to URL '/' separators.
    folder_part = request.pathname2url(folder_name)
    base = urljoin(GITHUB_URL, folder_part) + '/'
    return urljoin(base, request.pathname2url(file_name))
def gather_data(examples_folder):
    """
    Walk a folder tree, reading every metadata file found into a list of
    example dictionaries and collecting the GitHub URLs of all files that
    have a recognized code extension.

    :param examples_folder: The root folder where the scan is started.
    :return: A tuple of (examples, file URLs).
    :raises FileNotFoundError: When examples_folder is not a directory.
    """
    if not os.path.isdir(examples_folder):
        raise FileNotFoundError(f"{examples_folder} is not a directory.")
    examples, files = [], []
    for folder, subdirs, names in os.walk(examples_folder, topdown=True):
        # Prune ignored folders in place so os.walk never descends into them.
        subdirs[:] = [d for d in subdirs if d not in IGNORE_FOLDERS]
        for name in names:
            extension = os.path.splitext(name)[1].lstrip('.').lower()
            if extension in EXT_LOOKUP:
                files.append(make_github_url(folder, name))
            elif name.lower() in METADATA_FILENAMES:
                meta_path = os.path.join(folder, name)
                print(f"Found metadata: {meta_path}.")
                read_metadata(meta_path, examples)
    return examples, files
def read_metadata(file_path, examples):
    """
    Read a metadata file and append every yaml document it contains to the
    specified list of example dictionaries.

    :param file_path: That path to a metadata file that contains example metadata
                      in yaml format.
    :param examples: A list of example dictionaries. Examples read from the metadata
                     are appended to this list.
    """
    with open(file_path, 'r') as meta_stream:
        try:
            for example_meta in yaml.safe_load_all(meta_stream):
                if example_meta is None:
                    # An empty '---' section parses to None; note it and move on.
                    print(f"Empty section found in {file_path}.")
                    continue
                # Remember where this example came from so the report can
                # attribute duplicates and missing files.
                example_meta['metadata_path'] = file_path
                examples.append(example_meta)
        except (ScannerError, ParserError) as err:
            # A malformed file skips only that file, not the whole run.
            print(f"Yaml parser error in {file_path}, skipping.")
            print(err)
def write_report(examples, repo_files, report_path=None, summarize=False, dirty=False):
    """
    Writes a report of files cleaned versus files awaiting cleanup.

    Files that are listed in metadata but do not exist in the repo are output
    as missing files. Files that are listed more than once in any metadata are
    output as duplicates. The report includes the full list of example metadata
    in CSV format.

    :param examples: A list of example dictionaries read from metadata files.
    :param repo_files: A list of GitHub URLs for all code files found in the repo.
    :param report_path: The output file to write the report. If this file exists,
                        it is overwritten. If no file is specified, the report
                        is written to sys.stdout.
    :param summarize: Omit CSV output and only print the summary.
    :param dirty: Include dirty files in the full report.
    """
    lines = ["File,Language,Service"]
    clean_files = []
    clean_files_seen = set()  # Mirrors clean_files for O(1) duplicate checks.
    missing_files = []
    bad_examples = []
    # Comparison is case-insensitive because metadata and repo paths can
    # differ in case.
    repo_files_lookup = {rf.lower() for rf in repo_files}
    for example in examples:
        try:
            for file in example['files']:
                metadata_folder = os.path.split(example['metadata_path'])[0]
                file_url = make_github_url(metadata_folder, file['path'])
                if file_url.lower() in repo_files_lookup:
                    if file_url not in clean_files_seen:
                        clean_files.append(file_url)
                        clean_files_seen.add(file_url)
                        ext = os.path.splitext(file_url)[1].lstrip('.')
                        # Lowercase the extension: gather_data matches files
                        # case-insensitively, so an upper-case extension would
                        # otherwise raise KeyError here and be misreported by
                        # the outer handler as a missing metadata key.
                        language = EXT_LOOKUP[ext.lower()]
                        for service in file.get('services', ['']):
                            lines.append(
                                ','.join([file_url, language, service]))
                    else:
                        print(f"File '{file_url}' reported a second time in "
                              f"{example['metadata_path']}.")
                else:
                    missing_files.append(file_url)
                    print(
                        f"File '{file_url}' reported in metadata "
                        f"does not exist in the repo.")
        except KeyError as error:
            print(f"ERROR: example missing a required {error} key: {example}.")
            bad_examples.append(example)
    report = open(report_path, 'w') if report_path else sys.stdout
    try:
        clean_count = len(clean_files)
        total_count = len(repo_files)
        report.write(f"Total number of examples: "
                     f"{len(examples) - len(bad_examples)}.\n")
        report.write(f"Total number of cleaned files: {clean_count}.\n")
        report.write(f"Total number of files: {total_count}.\n")
        if total_count > 0:
            # Guard avoids ZeroDivisionError when the scan found no files.
            report.write(f"Percent clean: "
                         f"{clean_count/total_count:.0%}.")
        if not summarize:
            if len(lines) > 1:
                report.write("\n")
                report.write('\n'.join(lines))
            if dirty:
                # Dirty files = repo files never claimed by any metadata.
                clean_lookup = {file.lower() for file in clean_files}
                dirty_files = sorted(file for file in repo_files_lookup
                                     if file not in clean_lookup)
                report.write("\n")
                if dirty_files:
                    report.write("**Dirty files found:**\n")
                    report.write('\n'.join(dirty_files))
                else:
                    report.write("**No dirty files found!**")
    finally:
        # Close only real files; sys.stdout must stay open for the caller.
        if report is not sys.stdout:
            report.close()
            print(f"Report written to {report_path}.")
def main():
    """Parse command-line arguments, gather data, and write the report."""
    parser = argparse.ArgumentParser(
        description="Reads file metadata and writes a report of cleanup progress.")
    parser.add_argument(
        "--root",
        default=".",
        help="The folder to start the search for metadata files. Defaults to the "
             "current folder."
    )
    parser.add_argument(
        "--report",
        help="The file path to write the report. When not specified, writes "
             "to stdout."
    )
    parser.add_argument(
        "--summarize",
        action='store_true',
        help="Omits full CSV report and outputs only a summary."
    )
    parser.add_argument(
        "--dirty",
        action='store_true',
        help="Includes dirty files in the full report. This is most useful along with "
             "--root, to verify that you've added all new files in a subfolder."
    )
    args = parser.parse_args()
    try:
        examples, files = gather_data(args.root)
        write_report(examples, files, args.report, args.summarize, args.dirty)
    except (FileNotFoundError, KeyError) as error:
        # gather_data raises FileNotFoundError when --root is not a directory;
        # report it cleanly instead of letting the traceback escape.
        print(error)


if __name__ == '__main__':
    main()