forked from justinwang1/genderbias
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalysis.R
192 lines (148 loc) · 7.5 KB
/
analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
#Basic Setup
library(dummies)
library(MASS)
library(randomForest)
library(quantreg)
library(Qtools)
#Clean the data first - make sure data file is in same directory as cleandata.R
source("cleandata.R")
#Data Setup
simplenames <- c('pubyr','cite','name','phdyr','job','country','gender') #Give the columns easier shorthand names
gb <- data
colnames(gb) <- simplenames
gb$gender <- ifelse(gb$gender=="Male",0,1)
gb.m <- gb[which(gb$gender == 0),]; gb.f <- gb[which(gb$gender == 1),]
##Create dummies for the categorical variables
#gbfull is the original data gb but with all of the categorical variables turned into dummies
job.dummies <- dummy(gb$job,sep="_")
country.dummies <- dummy(gb$country,sep="_")
gbfull <- cbind(gb[,c("pubyr","cite","phdyr","gender")], job.dummies[,-6],country.dummies[,-5] )
#Naive wilcox test and boxplot
cite.m <- gbfull$cite[which(gb$gender==0)]; cite.f <- gbfull$cite[which(gb$gender== 1)]
wilcox.test(cite.m,cite.f)
boxplot(cite.m,cite.f,names=c("Male","Female"))
#Conditional t tests
#Variable "jobinfos" stores a table containing conditional t and Wilcox tests of cites between male and female,
#given each possible job title
#Nmale and Nfemale are the sample sizes of male and female in these conditional groups, respectively
alljobs <- c("Professor_Emeretis","Distinguished_Professor","Professor",
"Associate_Professor","Assistant_Professor","Other_Academia","Industry")
jobinfos <- matrix(0,length(alljobs),6)
colnames(jobinfos) <- c("Mean Cites","p-val (Wcx)","p-val (t)","t stat","Nmale","Nfemale")
rownames(jobinfos) <- alljobs
for(i in 1:length(alljobs)) {
myset <- alljobs[i]
jobindx.m <- which(gb.m$job %in% myset ); jobindx.f <- which(gb.f$job %in% myset )
num.m <- length(jobindx.m); num.f <- length(jobindx.f)
cc.m <- gb.m$cite[jobindx.m]; cc.f <- gb.f$cite[jobindx.f]
avg.cites <- mean(c(cc.m,cc.f))
wiltest <- wilcox.test(cc.m,cc.f); ttest <- t.test(cc.m,cc.f)
pval.wil <- wiltest$p.value; pval.t <- ttest$p.value; stat <- unname(ttest$statistic)
jobinfos[i,] <- round(c(avg.cites,pval.wil,pval.t,stat,num.m,num.f),3)
}
print(jobinfos)
#Similarly for the countries
allcountries <- c("United_States","Canada","United_Kingdom","Australia","Foreign")
countryinfos <- matrix(0,length(allcountries),6)
colnames(countryinfos) <- c("Mean Cites","p-val (Wcx)","p-val (t)","t stat","Nmale","Nfemale")
rownames(countryinfos) <- allcountries
for(i in 1:length(allcountries)) {
myset <- allcountries[i]
countryindx.m <- which(gb.m$country %in% myset ); countryindx.f <- which(gb.f$country %in% myset )
num.m <- length(countryindx.m); num.f <- length(countryindx.f)
cc.m <- gb.m$cite[countryindx.m]; cc.f <- gb.f$cite[countryindx.f]
avg.cites <- mean(c(cc.m,cc.f))
wiltest <- wilcox.test(cc.m,cc.f); ttest <- t.test(cc.m,cc.f)
pval.wil <- wiltest$p.value; pval.t <- ttest$p.value; stat <- unname(ttest$statistic)
countryinfos[i,] <- round(c(avg.cites,pval.wil,pval.t,stat,num.m,num.f),3)
}
print(countryinfos)
#Prints out the most prolific papers in the dataset with First Author's name
gb[order(gb$cite,decreasing=TRUE),]
#Correlations and scatterplots of the quantitative variables
subdata <- gb[,c("pubyr","cite","phdyr","gender")]
cor(subdata)
pairs(subdata)
#Boxplot of cites against job title
boxplot(gb$cite~gb$job)
summary(aov(gb$cite~gb$job))
#Boxplot of cites against country
boxplot(gb$cite~gb$country)
summary(aov(gb$cite~gb$country))
##Adding some interactions to the data.
#In particular I added all of the job title x gender and country x gender interactions as the most relevant ones
gender <- gbfull$gender
job.gender.interactions <- job.dummies[,-6]
for(i in 1:dim(job.gender.interactions)[2]) { job.gender.interactions[,i] <- job.gender.interactions[,i]*gender }
colnames(job.gender.interactions) <- c("Ass.Prof_x_Gender","Assoc.Prof_x_Gender","D.Prof_x_Gender","Industry_x_Gender",
"OA_x_Gender","PE_x_Gender")
country.gender.interactions <- country.dummies[,-5]
for(i in 1:dim(country.gender.interactions)[2]) {
country.gender.interactions[,i] <- country.gender.interactions[,i]*gender }
colnames(country.gender.interactions) <- c("AU_x_Gender","CA_x_Gender","Foreign_x_Gender","UK_x_Gender")
gbinter <- cbind(gbfull,job.gender.interactions)
##Linear Regression Model - first attempt at modeling
lin.model <- lm(cite~.,data=gbfull)
summary(lin.model)
#Some quick diagnostics for Linear Regression
#There are some problems with normality of residuals and heteroskedasticity
qqnorm(resid(lin.model)); qqline(resid(lin.model))
hist(resid(lin.model))
plot(predict(lin.model),resid(lin.model)); abline(h=0,col=2,lty=2)
##Poisson Regression Model
#While Poisson Regression makes intuitive sense as the response are count variables, ultimately the
#dispersion is absurdly high and the residual deviance is abnormally large. This is most likely due to
#the large variance within the citation counts (response).
pois.model <- glm(cite~.,data=gbfull,family="poisson")
summary(pois.model)
disp.test.model <- glm(cite~.,data=gbfull,family=quasipoisson(link=log))
summary(disp.test.model)
##Negative Binomial Regression Model
#An attempt to fix the overdispersion problem from before. Fit is somewhat better, but still poor.
#The issues are similar.
model.nb <- glm.nb(cite~.,data=gbfull)
summary(model.nb)
pchisq(model.nb$deviance,model.nb$df.residual,lower.tail=FALSE)
pchisq(2 * (logLik(model.nb) - logLik(pois.model)), df = 1, lower.tail = FALSE)
##Generalized Additive Model
#Not very interpretable, and lack of fit problem does not go away
require(gam)
gam1 <- gam(cite~s(pubyr)+s(phdyr)+job+country+gender,data=gb)
summary(gam1)
pchisq(gam1$deviance,gam1$df.residual,lower.tail=FALSE)
##Linear Model take two
#Try it again since the GLMs and GAM are not fitting the data well. Since the counts are highly varied, linear
#makes some level of sense.
#Box-Cox: select optimal lambda for applying transformation y* = y^(lambda)
gbfull1 <- gbfull; cites <- gbfull1$cite; cites[which(cites==0)] <- 1; gbfull1$cite <- cites
model <- lm(cite~.,data=gbfull1)
boxcox(model)
#log transformation.
newcite <- log(gb$cite)
newcite[which(newcite < 0)] <- 0 #log 0 = undefined, so changed 0 citations -> 1 citation
gbfull$cite <- newcite
lin.model <- lm(cite~.,data=gbfull)
summary(lin.model)
#Check assumptions again - improvement. Normality assumption improved a lot, but is still a bit off
qqnorm(resid(lin.model)); qqline(resid(lin.model))
hist(resid(lin.model))
plot(predict(lin.model),resid(lin.model)); abline(h=0,col=2,lty=2)
##Quantile regression
#Bypasses the normality of residuals assumption - does not need it
#Predict the median rather than E[Y|X]. Relevant here since there are quite a few "high-cite" papers that
#skew the results
qr.model <- rq(cite~.,data=gbfull1)
GOFTest(qr.model) #Is the model fit good?
##Prediction-based confidence intervals
#Fit model to Male data only, and then predict female data using Male Model.
#Then construct confidence interval of predictions - Actual Female Citations.
#Used both Linear Model and Random Forest.
gbmale <- gbfull[which(gbfull$gender==0),]; gbmale <- gbmale[,-which(colnames(gbmale) == "gender")]
gbfemale <- gbfull[which(gbfull$gender==1),]; gbfemale <- gbfemale[,-which(colnames(gbfemale) == "gender")]
male.model.lin <- lm(cite~.,data=gbmale); female.model.lin <- lm(cite~.,data=gbfemale)
pred.lin <- unname(predict(male.model.lin,gbfemale))
t.test(pred.lin,gbfemale$cite)
male.model.rf <- randomForest(cite~.,data=gbmale)
pred.rf <- unname(predict(male.model.rf,gbfemale))
t.test(pred.rf,gbfemale$cite)
male.model.qr <- qr(cite~.,data=gbmale)