Skip to content

Commit

Permalink
lots of changes whoops
Browse files Browse the repository at this point in the history
  • Loading branch information
Daniel Berry authored and Daniel Berry committed Nov 21, 2016
1 parent 6dce91f commit 9ce569e
Show file tree
Hide file tree
Showing 12 changed files with 47,157 additions and 46,612 deletions.
40 changes: 16 additions & 24 deletions ETL.r
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,6 @@
## Description: Code to transform data ##
#########################################

install.packages('sp')
install.packages('magrittr')
install.packages('stringr')
install.packages('geosphere')
install.packages('data.table')
install.packages('plyr')
install.packages('foreach')
install.packages('data.table')
install.packages('ggplot2')
install.packages('sp')
install.packages('rgeos')
install.packages('rgdal')

library(sp)
library(magrittr)
Expand Down Expand Up @@ -109,7 +97,7 @@ blocks_raw$CTA_counts <- CTA_counts$x

library(data.table)
library(plyr)
crimes <- fread('../rows.csv')
crimes <- fread('~/Downloads/Crimes_-_2001_to_present.csv')

doMC::registerDoMC(parallel::detectCores() - 3)

Expand All @@ -135,19 +123,22 @@ t2 <- t2[apply(is.na(t2),1,function(s) !any(s)),]
## return(c(df$GEOID10[1],count))
## }, .progress = 'text', .parallel = TRUE )

count_func <- function(i) {
print(i)
t1 <- as.matrix(blocks_raw[i,c('Latitude', 'Longitude')])
## count_func <- function(i) {
## print(i)
## t1 <- as.matrix(blocks_raw[i,c('Latitude', 'Longitude')])

dists <- spDists(t1,t2, longlat = TRUE)
## dists <- spDists(t1,t2, longlat = TRUE)

count <- sum(dists <= 1, na.rm = TRUE)
return(count)
}
## count <- sum(dists <= 1, na.rm = TRUE)
## return(c(i,count))
## }

## count_func(2340)

library(foreach)

result <- foreach(i=1:nrow(blocks_raw), .combine = c) %dopar% count_func(i)
## library(foreach)

## result <- foreach(i=1:nrow(blocks_raw), .combine = rbind) %dopar% count_func(i)


groceries <- read.csv('food-deserts-master/data/Grocery_Stores_-_2011.csv', stringsAsFactors = FALSE)
Expand Down Expand Up @@ -187,7 +178,7 @@ nrow(block_data <- merge(blocks_raw, population, by.x = 'TRACT_BLOC', by.y = 'CE

library(data.table)

crime <- fread('../rows.csv')
## crime <- fread('../rows.csv')

library(ggplot2)

Expand Down Expand Up @@ -327,7 +318,8 @@ all_data <- merge(block_data, public_health, by = 'Neighborhood', all.x = TRUE)
all_data <- merge(all_data, socioeconomic, by = 'Neighborhood', all.x = TRUE)
all_data <- merge(all_data, race, by = 'Neighborhood', all.x = TRUE)


load('result')
all_data$crime <- result[,2]

write.csv(all_data, file = 'all_data.csv')
save(all_data, file = 'all_data')
Expand Down
Binary file modified all_data
Binary file not shown.
92,672 changes: 46,336 additions & 46,336 deletions all_data.csv

Large diffs are not rendered by default.

12 changes: 10 additions & 2 deletions auto/paper.el
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
(TeX-add-style-hook
"paper"
(lambda ()
(TeX-add-to-alist 'LaTeX-provided-package-options
'(("geometry" "margin=1in")))
(TeX-run-style-hooks
"latex2e"
"IEEEtran"
"IEEEtran10"))
"report"
"rep10"
"geometry"
"amsmath")
(LaTeX-add-labels
"cpresult"
"npresult"
"ppresult"))
:latex)

257 changes: 253 additions & 4 deletions model.r
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,20 @@ model_data <- subset(all_data, TOTAL.POPULATION > 0)
## Complete pooling ##
######################

cp <- glm(desert ~ CTA_counts + vacant_counts, family = 'binomial', data = model_data)
cp <- glm(desert ~ CTA_counts + vacant_counts + crime, family = 'binomial', data = model_data)
summary(cp)
################
## No pooling ##
################

np <- glm(desert ~ CTA_counts + vacant_counts + Neighborhood, family = 'binomial', data = model_data)
np <- glm(desert ~ CTA_counts + vacant_counts + crime + Neighborhood, family = 'binomial', data = model_data)
summary(np)

#####################
## Partial pooling ##
#####################

pp <- glmer(desert ~ CTA_counts + vacant_counts + (1 | Neighborhood), data = model_data, family = 'binomial')
pp <- glmer(desert ~ CTA_counts + vacant_counts + crime + (1 | Neighborhood), data = model_data, family = 'binomial')
summary(pp)

##################
Expand All @@ -54,6 +54,96 @@ potential_covariates <- setdiff(names(all_data), exclude)
model_data_scale <- model_data
for (var in potential_covariates) {model_data_scale[var] <- as.numeric(scale(model_data[var]))}

library(mice)

vars <- c(
"vacant_counts" ,
"CTA_counts" ,
"store_counts" ,
"nearest_supermarket" ,
"TOTAL.POPULATION" ,
"desert" ,
"Community.Area.x" ,
"Birth.Rate" ,
"General.Fertility.Rate" ,
"Low.Birth.Weight" ,
"Prenatal.Care.Beginning.in.First.Trimester" ,
"Preterm.Births" ,
"Teen.Birth.Rate" ,
"Assault..Homicide." ,
"Breast.cancer.in.females" ,
"Cancer..All.Sites." ,
"Colorectal.Cancer" ,
"Diabetes.related" ,
"Firearm.related" ,
"Infant.Mortality.Rate" ,
"Lung.Cancer" ,
"Prostate.Cancer.in.Males" ,
"Stroke..Cerebrovascular.Disease." ,
"Childhood.Blood.Lead.Level.Screening" ,
"Childhood.Lead.Poisoning" ,
"Gonorrhea.in.Females" ,
"Gonorrhea.in.Males" ,
"Tuberculosis" ,
"Below.Poverty.Level" ,
"Crowded.Housing" ,
"Dependency" ,
"No.High.School.Diploma" ,
"Per.Capita.Income" ,
"Unemployment" ,
"Community.Area.Number" ,
"PERCENT.OF.HOUSING.CROWDED" ,
"PERCENT.HOUSEHOLDS.BELOW.POVERTY" ,
"PERCENT.AGED.16..UNEMPLOYED" ,
"PERCENT.AGED.25..WITHOUT.HIGH.SCHOOL.DIPLOMA",
"PERCENT.AGED.UNDER.18.OR.OVER.64" ,
"PER.CAPITA.INCOME" ,
"HARDSHIP.INDEX" ,
"Community.Area.y" ,
"NHW" ,
"NHB" ,
"NHAM" ,
"NHAS" ,
"NHOTHER" ,
"HISP" ,
"Multiple.Race.." ,
"TOTAL" ,
"NHW_p" ,
"NHB_p" ,
"NHAM_p" ,
"NHAS_p" ,
"NHOTHER_p" ,
"HISP_p" ,
"Multiple.Race.._p")

tmp_data <- mice(model_data_scale[, vars])

complete_datas <- lapply(1:5, function(i) complete(tmp_data, i))
complete_datas2 <- lapply(complete_datas, function(dat) {dat$Neighborhood <- model_data_scale$Neighborhood; dat})

models <- lapply(complete_datas2, function(dat) glmer(desert ~ CTA_counts + vacant_counts + (1 | Neighborhood), data = dat, family = 'binomial'))

fits <- with(tmp_data, glmer(desert ~ CTA_counts + vacant_counts + (1 | Neighborhood), family = 'binomial'))
pooled <- pool(fits)

tmp_data <- mice(model_data_scale[,!(names(model_data_scale) %in% c('Neighborhood', 'Community.Area.x', 'Community.Area.y'))])

cp <- glm(desert ~ CTA_counts + vacant_counts + crime, family = 'binomial', data = model_data_scale)
summary(cp)
################
## No pooling ##
################

np <- glm(desert ~ CTA_counts + vacant_counts + crime + Neighborhood, family = 'binomial', data = model_data_scale)
summary(np)

#####################
## Partial pooling ##
#####################

pp <- glmer(desert ~ CTA_counts + vacant_counts + crime + (1 | Neighborhood), data = model_data_scale, family = 'binomial', verbose = TRUE)
summary(pp)

## mlm_c <- glmer(desert ~ CTA_counts + vacant_counts + Diabetes.related + (1 | Neighborhood),
## data = model_data_scale,
## family = 'binomial')
Expand Down Expand Up @@ -122,6 +212,15 @@ registerDoMC(detectCores() - 3)

old_aic <- AIC(best_model)

form <- as.formula(paste('desert ~ vacant_counts + CTA_counts +',
paste(setdiff(search_covariates,
c('NHW', 'NHW_p', 'Multiple.Race..', 'Multiple.Race.._p', "PERCENT.AGED.UNDER.18.OR.OVER.64",
"PERCENT.HOUSEHOLDS.BELOW.POVERTY")), collapse = '+'),
'+(1|Neighborhood)'))

complete_model <- glmer(form, data = model_data_scale, family = 'binomial', verbose = TRUE)


fit_model <- function(i) {
print(paste('fitting:',i))
form <- as.formula(paste('desert ~ vacant_counts + CTA_counts +',
Expand All @@ -137,10 +236,160 @@ model <- glmer(desert ~ CTA_counts + vacant_counts + Gonorrhea.in.Females + Canc
Teen.Birth.Rate + No.High.School.Diploma + Lung.Cancer +
(1|Neighborhood),
data = model_data_scale,
family = 'binomial')
family = 'binomial',
verbose = TRUE)

model2 <- glmer(desert ~ CTA_counts +
vacant_counts +
Gonorrhea.in.Females +
Cancer..All.Sites. +
TOTAL.POPULATION +
NHAS +
Dependency +
Childhood.Lead.Poisoning +
Prenatal.Care.Beginning.in.First.Trimester +
Gonorrhea.in.Males +
NHAM_p +
Multiple.Race.. +
Stroke..Cerebrovascular.Disease. +
Firearm.related +
Tuberculosis +
NHW_p +
Teen.Birth.Rate +
No.High.School.Diploma +
Lung.Cancer +
Colorectal.Cancer +
Low.Birth.Weight +
Preterm.Births +
Assault..Homicide. +
Breast.cancer.in.females +
(1|Neighborhood),
data = model_data_scale,
family = 'binomial',
verbose = TRUE,
control = glmerControl(calc.derivs = FALSE))



model3 <- glmer(desert ~ CTA_counts +
vacant_counts +
Gonorrhea.in.Females +
TOTAL.POPULATION +
NHAS +
Dependency +
Childhood.Lead.Poisoning +
Prenatal.Care.Beginning.in.First.Trimester +
Gonorrhea.in.Males +
NHAM_p +
Multiple.Race.. +
Stroke..Cerebrovascular.Disease. +
Tuberculosis +
Teen.Birth.Rate +
No.High.School.Diploma +
Lung.Cancer +
Colorectal.Cancer +
(1|Neighborhood),
data = model_data_scale,
family = 'binomial',
verbose = TRUE,
control = glmerControl(calc.derivs = FALSE), optCtrl=list(maxfun=5000))

model4 <- glmer(desert ~ CTA_counts +
vacant_counts +
crime +
TOTAL.POPULATION +
NHAS +
Dependency +
Childhood.Lead.Poisoning +
Prenatal.Care.Beginning.in.First.Trimester +
NHAM_p +
Multiple.Race.. +
Stroke..Cerebrovascular.Disease. +
Tuberculosis +
Teen.Birth.Rate +
No.High.School.Diploma +
Lung.Cancer +
Colorectal.Cancer +
(1|Neighborhood),
data = model_data_scale,
family = 'binomial',
verbose = TRUE,
control = glmerControl(calc.derivs = FALSE, optCtrl=list(maxfun=5000)))

t_data <- complete_datas2[[1]]

model4 <- glmer(desert ~ CTA_counts +
vacant_counts +
TOTAL.POPULATION +
NHAS +
Dependency +
Childhood.Lead.Poisoning +
Prenatal.Care.Beginning.in.First.Trimester +
NHAM_p +.[.---------------[-]]
Multiple.Race.. +
Stroke..Cerebrovascular.Disease. +
Tuberculosis +
Teen.Birth.Rate +
No.High.School.Diploma +
Lung.Cancer +
Colorectal.Cancer +
(1|Neighborhood),
data = t_data,
family = 'binomial',
verbose = TRUE,
control = glmerControl(calc.derivs = FALSE, optCtrl=list(maxfun=5000)))

cp_mse <- c(); np_mse <- c(); pp_mse <- c(); mlm_mse <- c();

for (i in 1:10) {
cv_ind <- runif(nrow(model_data_scale)) < .8
train <- model_data_scale[cv_ind,]
test <- model_data_scale[!cv_ind,]

print(paste('TRAINING: ', i))

cp <- glm(desert ~ CTA_counts + vacant_counts + crime, family = 'binomial', data = train)
print(paste('AIC cp:', AIC(cp)))

np <- glm(desert ~ CTA_counts + vacant_counts + crime + Neighborhood, family = 'binomial', data = train)
print(paste('AIC np:', AIC(np)))

pp <- glmer(desert ~ CTA_counts + vacant_counts + crime + (1 | Neighborhood), data = train, family = 'binomial')
print(paste('AIC pp:', AIC(pp)))

mlm <- glmer(desert ~ CTA_counts +
vacant_counts +
crime +
TOTAL.POPULATION +
NHAS +
Dependency +
Childhood.Lead.Poisoning +
Prenatal.Care.Beginning.in.First.Trimester +
NHAM_p +
Multiple.Race.. +
Stroke..Cerebrovascular.Disease. +
Tuberculosis +
Teen.Birth.Rate +
No.High.School.Diploma +
Lung.Cancer +
Colorectal.Cancer +
(1|Neighborhood),
data = train,
family = 'binomial',
control = glmerControl(calc.derivs = FALSE, optCtrl=list(maxfun=5000)))
print(paste('AIC mlm:', AIC(mlm)))

print(paste('EVALUATING: ', i))



}


summary(model)

save(result, file = 'result')

in_vars <- c("Gonorrhea.in.Females" , 'Cancer..All.Sites.' , 'TOTAL.POPULATION' , 'NHAS' ,
'Dependency' ,'Childhood.Lead.Poisoning' , 'Prenatal.Care.Beginning.in.First.Trimester' , 'Gonorrhea.in.Males'
, 'NHAM_p' , 'Multiple.Race..' , 'Stroke..Cerebrovascular.Disease.' , 'Firearm.related' , 'Tuberculosis' , 'NHW_p' ,
Expand Down
Loading

0 comments on commit 9ce569e

Please sign in to comment.