forked from cncf/gitdm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgen_aff_task.rb
executable file
·94 lines (84 loc) · 2.76 KB
/
gen_aff_task.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env ruby
require 'csv'
require 'json'
require 'uri'
require 'pry'
# The script needs the path of the developer list as its first argument;
# without it, print the usage hint and stop with a failure status.
if ARGV.empty?
  puts "Missing argument: unknowns.txt|alldevs.txt"
  exit(1)
end
# Index the input file by e-mail address, which is the second tab-separated
# field of each line. The stored value is the whole line with surrounding
# whitespace stripped. Only the first line seen for an address is kept; any
# later line with the same address is reported and dropped.
email2line = {}
File.foreach(ARGV[0]) do |raw_line|
  line = raw_line.strip
  # Blank lines (e.g. a trailing newline at the end of the file) hold no record.
  next if line.empty?

  email = line.split("\t")[1]
  if email.nil?
    # A line without a second field has no address to key the record on, so it
    # is reported and left out instead of being stored under a nil key.
    puts "Skipping line without email: #{line}"
  elsif email2line.key?(email)
    puts "Duplicate email: #{line}"
  else
    email2line[email] = line
  end
end
# Lookup tables keyed by the 'email' value of each entry in github_users.json
# (read from the current working directory):
#   email2gh  - list of GitHub profile URLs (login lower-cased), de-duplicated
#   genders   - the entry's 'sex' value
#   locations - the entry's 'location' value
#   caffs     - the entry's 'affiliation' value
# When several entries share an address, the last one wins for the three
# scalar tables while every profile URL is collected.
email2gh = {}
genders = {}
locations = {}
caffs = {}
github_users = JSON.parse(File.read('github_users.json'))
github_users.each do |user|
  email = user['email']
  profile_url = "https://github.com/#{user['login'].downcase}"
  (email2gh[email] ||= []) << profile_url
  genders[email] = user['sex']
  locations[email] = user['location']
  caffs[email] = user['affiliation']
end
# Remove repeated profile URLs from every list in place.
email2gh.each_value(&:uniq!)
# Builds a LinkedIn search URL for +keywords+.
#
# The keywords are percent-encoded as a query-string value, so characters such
# as '&' or '+' inside a name cannot break the URL. Spaces are emitted as
# "%20" rather than "+". URI.escape, which this replaces, was removed in
# Ruby 3.0.
def linkedin_search_url(keywords)
  encoded = URI.encode_www_form_component(keywords).gsub('+', '%20')
  "https://www.linkedin.com/search/results/index/?keywords=#{encoded}"
end

# Extend every input line with seven more tab-separated columns:
# GitHub profile URLs (comma-joined, or '-' when the address is unknown),
# three LinkedIn searches (name, name + user part of the address,
# name + first label of the address domain, or '-' when there is no domain),
# gender, location and affiliation.
found = not_found = 0
email2line.each do |email, line|
  fields = line.split("\t")
  # Short lines may lack a name; use an empty string instead of failing on nil.
  name = fields[2].to_s
  # Addresses are written as "user!domain"; either part may be absent.
  uname, dom = email.to_s.split('!')

  domain_search =
    if dom.nil? || dom.empty?
      '-'
    else
      linkedin_search_url("#{name} #{dom.split('.').first}")
    end
  searches = [
    linkedin_search_url(name),
    linkedin_search_url("#{name} #{uname}"),
    domain_search
  ]

  # Missing lookups become empty columns.
  gender = genders[email].to_s
  location = locations[email].to_s
  caff = caffs[email].to_s
  # These two placeholder values mean "no affiliation known".
  caff = '' if ['NotFound', '(Unknown)'].include?(caff)

  if email2gh.key?(email)
    github = email2gh[email].join(',')
    found += 1
  else
    github = '-'
    not_found += 1
  end

  # Overwriting the value of an existing key is safe while iterating the hash.
  email2line[email] = [line, github, *searches, gender, location, caff].join("\t")
end
puts "Found #{found}, not found #{not_found}"
# Optional filters, enabled by the mere presence of the environment variable:
#   ONLY_GH  - keep only rows that have a GitHub profile
#   ONLY_EMP - keep only rows whose affiliation column is empty
onlygh = ENV.key?('ONLY_GH')
onlyemp = ENV.key?('ONLY_EMP')

# Split every line back into columns and order rows by the fourth column
# (written out under the 'patches' header), highest number first.
arr = email2line.values.map { |line| line.split("\t") }
arr = arr.sort_by { |item| -item[3].to_i }
hdr = %w(type email name github linkedin1 linkedin2 linkedin3 patches gender location affiliations)

# The CSV is written next to the input file, with the input's extension
# replaced by '.csv'. File.extname is used so that a path without an extension
# (or with a dot only in a directory name) just gets '.csv' appended instead of
# being truncated.
input_path = ARGV[0]
extension = File.extname(input_path)
stem = extension.empty? ? input_path : input_path.chomp(extension)
csv_path = "#{stem}.csv"

CSV.open(csv_path, 'w', headers: hdr) do |csv|
  csv << hdr
  arr.each do |ary|
    github = ary[4]
    affiliation = ary[10]
    # Trailing empty columns are dropped by split, hence the nil checks.
    next if onlygh && (github.nil? || github == '' || github == '-')
    next if onlyemp && !affiliation.nil? && affiliation != ''
    puts "#{ary[0]}/#{ary[1]}: #{ary[4]} --- #{ary[8]} --- #{ary[9]} --- #{ary[10]}"
    # Reorder the columns to match hdr: column 3 moves after the LinkedIn links.
    csv << ary.values_at(0, 1, 2, 4, 5, 6, 7, 3, 8, 9, 10)
  end
end