Data Exercise

library(ggplot2)
Warning: package 'ggplot2' was built under R version 4.3.3
library(dplyr)
Warning: package 'dplyr' was built under R version 4.3.3

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
#seed for reproducibility
set.seed(92)
#start generating data
n = 1200

#bone breaks in adolescents
age <- round(runif(n, 10, 18), 0)
break_type <- c("compound", "spiral", "transverse", "greenstick", "comminuted")
bone_breaks <- sample(break_type, n, replace = TRUE, prob = c(0.2, 0.15, 0.4, 0.38, 0.29))
break_num <- as.numeric(factor(bone_breaks, levels = break_type))

#surgery cost to fix broken bone
surgery_cost <- round(1500 + age * 10 + break_num * 200 + rnorm(n, mean = 0, sd = 50), 0)
#time in hours for surgery
surgery_time <- round(12 - age * 0.1 + rnorm(n, mean = 0, sd = 5), 1)

#percent of recovery without complications
recovery <- round(5 + surgery_cost * 0.005 - surgery_time * 0.05 + rnorm(n, mean = 0, sd = 1), 1)

#data frame for break data
break_data <- data.frame(age, break_type, surgery_cost, surgery_time, recovery)

summary(break_data)
      age         break_type         surgery_cost   surgery_time  
 Min.   :10.00   Length:1200        Min.   :1729   Min.   :-7.00  
 1st Qu.:12.00   Class :character   1st Qu.:2148   1st Qu.: 7.40  
 Median :14.00   Mode  :character   Median :2292   Median :10.60  
 Mean   :14.01                      Mean   :2296   Mean   :10.58  
 3rd Qu.:16.00                      3rd Qu.:2487   3rd Qu.:13.70  
 Max.   :18.00                      Max.   :2823   Max.   :32.20  
    recovery    
 Min.   :11.00  
 1st Qu.:14.90  
 Median :16.10  
 Mean   :16.01  
 3rd Qu.:17.20  
 Max.   :20.50  
#surgery cost by age
ggplot(break_data, aes(x = age, y = surgery_cost)) + 
  geom_point() + 
  geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'

#surgery cost by break type
ggplot(break_data, aes(x = break_type, y = surgery_cost)) + 
  geom_boxplot()

#surgery time by age
ggplot(break_data, aes(x = age, y = surgery_time)) + 
  geom_point() + 
  geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'

#surgery cost by time
ggplot(break_data, aes(x = surgery_time, y = surgery_cost)) + 
  geom_point() + 
  geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'

#recovery rate by surgery time
ggplot(break_data, aes(x = surgery_time, y = recovery)) + 
  geom_point() + 
  geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'

###Fit simple models to the data

#cost by age and break type model
cost_model <- lm(surgery_cost ~ age + break_type, data = break_data)
summary(cost_model)

Call:
lm(formula = surgery_cost ~ age + break_type, data = break_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-554.64 -150.89   -1.69  189.80  524.72 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)          2202.709     46.912  46.954   <2e-16 ***
age                     5.856      3.172   1.846   0.0651 .  
break_typecompound     12.296     23.519   0.523   0.6012    
break_typegreenstick   24.524     23.512   1.043   0.2971    
break_typespiral       16.426     23.525   0.698   0.4852    
break_typetransverse    1.879     23.568   0.080   0.9365    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 257.5 on 1194 degrees of freedom
Multiple R-squared:  0.003977,  Adjusted R-squared:  -0.0001939 
F-statistic: 0.9535 on 5 and 1194 DF,  p-value: 0.4453
#time by age model
time_model <- lm(surgery_time ~ age, data = break_data)
summary(time_model)

Call:
lm(formula = surgery_time ~ age, data = break_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-17.1776  -3.2440   0.0192   3.1614  21.8880 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  12.4627     0.8666  14.382   <2e-16 ***
age          -0.1344     0.0610  -2.204   0.0277 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.973 on 1198 degrees of freedom
Multiple R-squared:  0.004037,  Adjusted R-squared:  0.003205 
F-statistic: 4.856 on 1 and 1198 DF,  p-value: 0.02774
#recovery rate by cost model
recovery_model <- lm(recovery ~ surgery_cost + surgery_time, data = break_data)
summary(recovery_model)

Call:
lm(formula = recovery ~ surgery_cost + surgery_time, data = break_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.9103 -0.7416 -0.0134  0.7328  3.1936 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)   5.045455   0.268072  18.821   <2e-16 ***
surgery_cost  0.005007   0.000114  43.902   <2e-16 ***
surgery_time -0.050294   0.005895  -8.531   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.016 on 1197 degrees of freedom
Multiple R-squared:  0.6222,    Adjusted R-squared:  0.6215 
F-statistic: 985.6 on 2 and 1197 DF,  p-value: < 2.2e-16
#exploring another type of model
recovery_glm <- glm(recovery ~ surgery_cost + surgery_time, family = gaussian(link = "identity"), data = break_data)
summary(recovery_glm)

Call:
glm(formula = recovery ~ surgery_cost + surgery_time, family = gaussian(link = "identity"), 
    data = break_data)

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)   5.045455   0.268072  18.821   <2e-16 ***
surgery_cost  0.005007   0.000114  43.902   <2e-16 ***
surgery_time -0.050294   0.005895  -8.531   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 1.032061)

    Null deviance: 3269.7  on 1199  degrees of freedom
Residual deviance: 1235.4  on 1197  degrees of freedom
AIC: 3448.3

Number of Fisher Scoring iterations: 2