-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy path1a. Simulations - Save Data.R
85 lines (76 loc) · 2.51 KB
/
1a. Simulations - Save Data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
library(tidyverse)
library(RPostgres)
library(fixest)
library(e1071)
library(kableExtra)
library(ggthemes)
library(patchwork)
library(did)
library(furrr)
library(latex2exp)
library(bacondecomp)
library(ggforce)
library(fastDummies)
library(progressr)
# source my passwords
source('/Users/andrew/Box Sync/Projects/Passwords/Password.R')
# Connect to WRDS Server --------------------------------------------------
wrds <- dbConnect(Postgres(),
host = 'wrds-pgdata.wharton.upenn.edu',
port = 9737,
user = user,
password = password,
dbname = 'wrds',
sslmode = 'require')
# download compustat data
comp <- tbl(wrds, sql("SELECT gvkey, fyear, oibdp, at, indfmt, datafmt, popsrc, consol, fic, sich FROM comp.funda")) %>%
# filter the data - between 1979 and 2015, non-missing assets, and in the US
filter(indfmt == 'INDL' & datafmt == 'STD' & popsrc == 'D' & consol == 'C' & !is.na(fyear) &
fyear %>% between(1979, 2015) & !is.na(at) & at > 0 & fic == "USA") %>%
# make ROA variable
group_by(gvkey) %>%
mutate(roa = oibdp / lag(at),
gvkey = as.numeric(gvkey)) %>%
# drop missing ROA
filter(!is.na(roa)) %>%
# drop 1979 observations - just needed lagged assets
filter(fyear >= 1980) %>%
ungroup() %>%
collect()
# download comp header which has more info on location etc
comp_header <- tbl(wrds, sql("SELECT * FROM crsp.comphead")) %>%
mutate(gvkey = as.numeric(gvkey)) %>%
collect()
# merge in state of incorporation and industry information
comp <- comp %>%
left_join(comp_header %>% select(gvkey, incorp, sic)) %>%
# drop if state of incorporation is missing or not 50 states
filter(!is.na(incorp) & incorp %in% state.abb) %>%
# clean up SIC code - use historical sic code if available, if not use header sic
mutate(sich = coalesce(sich, sic)) %>%
# drop financial firms
filter(!(sich %in% c(6000:6999)))
# make sure that each firm has at least 5 observations
comp <- comp %>%
group_by(gvkey) %>%
add_tally() %>%
filter(n >= 5) %>%
ungroup()
# winsorize ROA at 99, and censor at -1
wins <- function(x) {
# winsorize and return
case_when(
is.na(x) ~ NA_real_,
x < -1 ~ -1,
x > quantile(x, 0.99, na.rm = TRUE) ~ quantile(x, 0.99, na.rm = TRUE),
TRUE ~ x
)
}
# winsorize ROA by year
comp <- comp %>%
group_by(fyear) %>%
mutate(roa = wins(roa)) %>%
arrange(gvkey, fyear) %>%
ungroup()
# save
saveRDS(comp, here::here("Data", "simulation_data.rds"))