forked from open-chinese/alpaca-chinese-dataset
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean.py
35 lines (28 loc) · 1.05 KB
/
clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import json
import math
from os import listdir
from os.path import isfile, join
# Directory that clean() lists to find the split dataset files.
split_file_dir = './data'
# Path template for a numbered split file; not referenced by the code visible here.
split_file_path_template = './data/alpaca_chinese_part_{0}.json'
# Not referenced by the code visible here; presumably the number of samples
# stored per split file -- TODO confirm against the splitting script.
chunk_size = 1000
# Substrings searched for in the lower-cased 'en_output' of every sample, so
# each entry must itself be written in lowercase to ever match.
target_patterns = ('nooutput',)
def clean(data_dir=None, patterns=None):
    """Report samples whose English output contains an unwanted pattern.

    Every regular ``*.json`` file directly inside ``data_dir`` is loaded as a
    list of sample dicts. A sample is reported (its file path and
    ``en_instruction`` are printed) when its lower-cased ``en_output``
    contains any of ``patterns``. Nothing is modified on disk.

    Args:
        data_dir: Directory to scan. Defaults to the module-level
            ``split_file_dir``.
        patterns: Iterable of lowercase substrings to look for. Defaults to
            the module-level ``target_patterns``.

    Returns:
        The number of matching samples (each sample is counted once even if
        several patterns match).

    Raises:
        OSError: If ``data_dir`` or one of its files cannot be read.
        json.JSONDecodeError: If a ``*.json`` file is not valid JSON.
    """
    # Resolve the defaults at call time so a plain ``clean()`` keeps using
    # the module-level settings.
    if data_dir is None:
        data_dir = split_file_dir
    if patterns is None:
        patterns = target_patterns

    total_count = 0
    # listdir() returns entries in arbitrary order; sort for a stable report.
    for file_name in sorted(listdir(data_dir)):
        split_file = join(data_dir, file_name)
        # Skip non-JSON entries and directories that merely end in '.json'.
        if not file_name.endswith('.json') or not isfile(split_file):
            continue
        with open(split_file, 'r', encoding='utf-8') as rf:
            items = json.load(rf)
        for item in items:
            # A missing or null output cannot match; treat it as empty
            # instead of crashing the whole scan.
            en_output = (item.get('en_output') or '').lower()
            if any(pattern in en_output for pattern in patterns):
                print(split_file, item.get('en_instruction'))
                total_count += 1
    print('batch clean done, found {0} samples'.format(total_count))
    return total_count
# Script entry point: run the scan only when executed directly, not on import.
if __name__ == '__main__':
    clean()  # scan all split files and print the matching samples