import re
import json
import os

from scrapy.spiders import Rule, CrawlSpider
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from w3lib.html import remove_tags

from crawler.utils import APIItem, nice_dump, process_code_info
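
# Note: APIItem, nice_dump and process_code_info are project-local helpers
# (crawler/utils.py, not shown here). From their use below, APIItem is the
# scrapy Item holding one documentation entry, nice_dump pretty-prints JSON
# to a file, and process_code_info appears to parse a signature string into
# a dict whose 'parameters' entry is a list of {'name': ..., 'type': ...}
# dicts.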


class NumpySpider(CrawlSpider):
    name = "numpy"
    version = "1.18"
    allowed_domains = ['numpy.org']
    start_urls = [f'https://numpy.org/doc/{version}/genindex.html']
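    # Splits a rendered signature such as "numpy.ravel(a, order)" into the
    # dotted name and the raw argument list. Defined here but not referenced
    # anywhere in this file.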
    split_def = re.compile(r'^([\w.]+)\(([\w,\s=*.]*)\)')
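
    # Follow every *.html link reachable from <li> entries on the general
    # index page and hand each page to parse_api.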
    rules = (
        Rule(LinkExtractor(allow=re.compile(r'.+\.html'),
                           restrict_xpaths='//li'),
             callback='parse_api'),
    )

    def parse_api(self, response):
        # Initialise every field to the string 'None' so entries missing a
        # section still serialise cleanly into the JSON feed.
        item = APIItem()
        item['library'] = 'numpy'
        self.logger.info(f'Scraping {response.url}')
        item_id = 'None'
        code = 'None'
        description = 'None'
        returns = 'None'
        examples = []
        # The first <dt> holds the anchor id and the rendered signature.
        dt = response.css('dt')
        if dt:
            item_id = dt.attrib['id']
            code = remove_tags(dt.get())

        # Use a temporary selector so the 'None' default survives pages
        # without a description block.
        desc_sel = response.css('dd')
        if desc_sel:
            description = remove_tags(desc_sel.get())
        # Parameters may appear either in a table row or, on other pages,
        # in a <dl class="field-list">; try both layouts.
        params_tr = response.xpath('//tr[contains(text(), "Parameters")]')
        parameters = []
        if params_tr:
            for p in params_tr.css('dd').getall():
                if "Parameters" not in p:
                    parameters.append(remove_tags(p).replace('\n', ''))
        if not params_tr:
            list_of_items = response.css('dl.field-list')
            is_param_list = list_of_items.xpath('//dt[contains(text(), "Parameters")]').get()
            if is_param_list:
                for p in list_of_items.css('dt').getall():
                    if "Parameters" not in p:
                        parameters.append(remove_tags(p))
        # Same two-layout fallback for the Returns section. Temporary
        # variables keep the 'None' default when nothing matches (the
        # original assigned an empty SelectorList to `returns`).
        return_tr = response.xpath('//tr[contains(text(), "Returns")]')
        if return_tr:
            ret = return_tr.css('dd').get()
            if ret:
                returns = remove_tags(ret)
        if not return_tr:
            ret_sel = response.xpath('//dt[.="Returns"]/following-sibling::dd[1]')
            if ret_sel:
                returns = remove_tags(ret_sel.get()).replace('\n', '')
        # Collect the doctest-style blocks that follow the "Examples"
        # heading (examples was already initialised above).
        example_p = response.xpath('//p[contains(text(), "Examples")]/following::div').getall()
        for e in example_p:
            example = remove_tags(e)
            if '>>>' in example:
                examples.append(example.replace('>>>', ''))

        item['item_id'] = item_id
        item['code'] = code
        item['description'] = description
        item['parameters'] = parameters
        item['returns'] = returns
        item['examples'] = examples
        yield item
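
# Illustrative shape of one scraped item (hypothetical values, for
# orientation only):
# {
#     "library": "numpy",
#     "item_id": "numpy.ravel",
#     "code": "numpy.ravel(a, order='C')",
#     "description": "Return a contiguous flattened array. ...",
#     "parameters": ["a : array_like ...", "order : {'C', 'F'} ..."],
#     "returns": "y : array_like ...",
#     "examples": ["x = np.array([[1, 2, 3], [4, 5, 6]]) ..."]
# }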

def preprocess_numpy_data(raw_data_file):
    # Load the raw data dumped by the spider.
    with open(raw_data_file) as f:
        data = json.load(f)
    processed_data = []
    for item in data:
        # TODO: find better ways to exclude non-functions
        if '(' not in item['code']:
            continue
        processed_item = dict()
        # Unify the notation for the code: the dotted item id plus the
        # "(...)" argument list from the rendered signature.
        raw_code = item['code']
        code = item['item_id'] + raw_code[raw_code.find('('):raw_code.find(')') + 1]
        processed_item['code'] = code
        # Take the first sentence of the description as the summary.
        description = item['description']
        summary = description.split('. ')[0]
        processed_item['item_id'] = item['item_id']
        processed_item['summary'] = summary
        processed_item['description'] = ''
        processed_item['example'] = item['examples']
        processed_item['returns'] = item['returns']
        processed_item['code-info'] = process_code_info(processed_item['code'])
        # Attach an NL description to each argument by slicing the page
        # description between consecutive "\n<name> (" markers.
        arg_json: list = processed_item['code-info']['parameters']
        arg_names = [arg['name'] for arg in arg_json]
        matching_result = dict()
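        # Example (hypothetical description text): given a description
        # containing "\na (array_like) Input data.\norder (str) Read
        # order.\n\n", the slice stored for "a" is
        # "(array_like) Input data.".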
        for i, arg_name in enumerate(arg_names):
            start_mark = '\n' + arg_name + ' ('
            if i != len(arg_names) - 1:
                end_mark = '\n' + arg_names[i + 1] + ' ('
            else:
                end_mark = '\n\n'
            if not (start_mark in description and end_mark in description):
                continue
            matching_result[arg_name] = '(' + description.split(start_mark)[1].split(end_mark)[0]
        for arg_dict in arg_json:
            arg_dict['description'] = matching_result.get(arg_dict['name'], '')
        # Augment empty argument types using the parenthesised type hint at
        # the start of the argument's NL description.
        for arg in arg_json:
            if arg['type'] == '':
                # TODO: figure out why it fails sometimes
                try:
                    description_types = arg['description'].split('(')[1].split(')')[0]
                    if 'int' in description_types:
                        arg['type'] = 'int'
                    elif 'float' in description_types:
                        arg['type'] = 'float'
                    elif 'bool' in description_types:
                        arg['type'] = 'bool'
                    elif 'Tensor' in description_types:
                        arg['type'] = 'tensor'
                    elif 'string' in description_types:
                        arg['type'] = 'string'
                    else:
                        arg['type'] = 'others'
                except IndexError:
                    arg['type'] = 'others'
        processed_data.append(processed_item)

    preprocessed_json_file_name = 'preprocessed_' + raw_data_file
    if os.path.exists(preprocessed_json_file_name):
        os.remove(preprocessed_json_file_name)
    nice_dump(preprocessed_json_file_name, processed_data)
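
# Illustrative shape of one preprocessed record (hypothetical values):
# {
#     "code": "numpy.ravel(a, order)",
#     "item_id": "numpy.ravel",
#     "summary": "Return a contiguous flattened array",
#     "description": "",
#     "example": ["..."],
#     "returns": "y : array_like ...",
#     "code-info": {"parameters": [{"name": "a", "type": "others",
#                                   "description": "(array_like) ..."}]}
# }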


if __name__ == '__main__':
    json_file_name = 'numpy_docs.json'
    # Remove any stale feed so scrapy does not append to it.
    if os.path.exists(json_file_name):
        os.remove(json_file_name)
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'FEED_FORMAT': 'json',
        'FEED_URI': json_file_name
    })
    process.crawl(NumpySpider)
    process.start()  # blocks until the crawl finishes
    print("crawling complete, starting preprocessing...")
    preprocess_numpy_data(json_file_name)
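
# Usage: run `python np_spider.py` from the directory that contains the
# crawler package (so the crawler.utils import resolves). The crawl writes
# numpy_docs.json; preprocessing then writes preprocessed_numpy_docs.json.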