-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLogistic_Regression_Models_ESSdata.R
193 lines (155 loc) · 5.78 KB
/
Logistic_Regression_Models_ESSdata.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#Set working directory
rm(list = ls())
setwd("~/PUBL0055")
library(readr)
#European Social Survey dataset
ess<- read_csv("ess.csv")
#Important packages for analysis and modelling
library(tidyverse)
library(tidyr)
library(dplyr)
library(texreg)
library(foreign)
#EDA of ESS dataset
str(ess)
summary(ess)
dim(ess)
names(ess)
#How clean is the data?
head(ess,15)
tail(ess,15)
#Are there any missing values?
sum(is.na(ess))#No missing values
#Variable coercion
#Turn trade union into a factor variable
str(ess$trade_union)
table(ess$trade_union)
ess$trade_union <-factor(ess$trade_union, levels = c(0,1), labels = c("Not-Member", "Member"))
summary(ess$trade_union)
class(ess$trade_union)
#Turn unemployed into a factor variable
str(ess$unemployed)
table(ess$unemployed)
ess$unemployed <-factor(ess$unemployed, levels = c(FALSE,TRUE), labels =c("Employed", "Unemployed"))
summary(ess$unemployed)
class(ess$trade_union)
#Visualization
library(ggplot2)
ggplot(ess,aes(x = years_education, y = leave, color = unemployed)) +
geom_jitter(width = 0, height = 0.09, alpha = 0.5)
#Take a look at the level of country attachment
str(ess$country_attach)
summary(ess$country_attach)
attach_country<-seq(0,10, length.out = 100)
summary(ess$country_attach)
str(ess$country_attach)
#Turn leave into a factor variable
str(ess$leave)
table(ess$leave)
ess$leave <- factor(ess$leave, levels =c(0,1), labels = c("no","yes"))
summary(ess$leave)
table(ess$leave)
str(ess$leave)
#Logistic Regression Modelliing
#Model 1
logit_M1 <- glm(leave ~ trade_union + unemployed + years_education + country_attach + eu_integration,
data = ess, family = binomial(link = "logit"))
screenreg(logit_M1)
summary(logit_M1)
#Model 2
logit_M2 <- glm(leave ~ years_education + immig_econ + trust_politicians,
data = ess, family = binomial(link = "logit"))
summary(logit_M2)
screenreg(logit_M2)
#Model 3
logit_M3 <- glm(leave ~ years_education + country_attach,
data = ess, family = binomial(link = "logit"))
summary(logit_M3)
screenreg(logit_M3)
logit_M4 <- glm(leave ~ years_education + eu_integration,
data = ess, family = binomial(link = "logit"))
summary(logit_M4)
screenreg(logit_M4)
#All values are statistically significant - publish table to word
htmlreg(list(logit_M1),file = "LogitModel.doc")
#FitStatistics
mean(ess$leave)
summary(ess$leave)
#Fitted Values and Predicted Probabilities
ess$pps1 <- predict(logit_M1, newdata = ess, type = "response")
ess$evs1 <- ifelse(ess$pps1 > 0.5, yes = 1, no = 0)
#Confusion matrix to find model fit - actual outcomes
confusion <- table(actual = ess$leave, expected.value = ess$evs1)
confusion #Expected values for leave and remain
sum(diag(confusion)) / sum(confusion) #There is an 82% success of prediction so therefore a strong model
#PredictedProbabilities:
#Likelihood to vote 'leave'; EU integration and number of years education
eu_integration_0<- predict(
logit_M4,
newdata = data.frame(years_education = 13, eu_integration = 0),
type = "response"
)
eu_integration_0 #Likelihood to vote leave is 52% given education only up to age 16
eu_integration_10<- predict(
logit_M4,
newdata = data.frame(years_education = 20, eu_integration = 5),
type = "response"
)
eu_integration_10 #Liklihood to vote leave is 10% given education at university level
#However, it is clear the variable, eu_integration is far a more signficant predictor in voting leave...
#Plot1
years_education_profiles <- data.frame(
years_education = seq(from = 0, to = 54, by = .5),
eu_integration = 0)
head(years_education_profiles)
years_education_profiles$predicted_probs <- predict(
logit_M4, newdata = years_education_profiles,
type = "response"
)
#Plot 1: Voting Leave by Years of Education:
ggplot(years_education_profiles, aes(x = years_education, y = predicted_probs)) +
geom_line(alpha = 0.5) + ylab("Probability of Voting Leave") + xlab("Number of Year of Education") + ggtitle("Voting Leave by Years of Education")
#Predicted probabilities of voting leave for those who a strongly attatched to their country
country_attatchment1 <- predict(
logit_M3,
newdata = data.frame(country_attach = 10, years_education = 13),
type = "response"
)
country_attatchment1
country_attatchment2 <- predict(
logit_M3,
newdata = data.frame(country_attach = 0, years_education = 13),
type = "response"
)
country_attatchment2#Those less emotionally attatched more likley to vote to remain
#Difference between the predicted probabilities
country_attatchment1 - country_attatchment2
#New model with better explanatory power
logit_M5 <- glm(leave~trade_union+unemployed+years_education+
country_attach+eu_integration+immig_econ+immig_culture,
data = ess, family = binomial(link = "logit"))
screenreg(logit_M5)
summary(logit_M5)
#Fit statistic for new model
mean(ess$leave)
summary(ess$leave)
ess$pps <- predict(logit_M5, newdata = ess, type = "response")
ess$evs <- ifelse(ess$pps > 0.5, yes = 1, no = 0)
#Confusion matrix to find model fit - actual outcomes
confusion <- table(actual = ess$leave, expected.value = ess$evs)
confusion
sum(diag(confusion)) / sum(confusion) #Comparing Model 1 and 5, we can see that the new model is slightly better.
#PredictiveProbabilities for Model 6
logit_M6 <- glm(leave~immig_econ+immig_culture,
data = ess, family = binomial(link = "logit"))
pred_prob_1 <- predict(
logit_M6,
newdata = data.frame(immig_culture = 0,immig_econ = 0),
type = "response")
pred_prob_1 #Those who are who think immigration is bad for economy have a 46% chance of voting to leave.
pred_prob_2 <- predict(
logit_M6,
newdata = data.frame(immig_culture = 5,immig_econ = 5),
type = "response")
pred_prob_2 #Those who are impartial have a 17% chance of voting to leave.
pred_prob_1-pred_prob_2 #Difference of 29%