# ============================== # Analysis CEESEN_BENDER - all countries together # Correlations, OLS, LASSO # ============================== # --- Packages to install --- library(readxl) # import Excel library(dplyr) # data wrangling library(ggplot2) # graphs library(corrplot) # heatmap correlations library(GGally) # scatterplot matrix library(car) # VIF library(lmtest) # diagnostic tests library(sandwich) # robust SE library(glmnet) # LASSO library(broom) # tidy results # --- 1. Import data --- df<- read_excel("C:/Users/YOUR NAME/Documents/Projects/countries_bender.xlsx") # --- 2. cleaning --- # a) substitute " " with NA df[df == " "] <- NA # b) convert character variables in numeric where possible df <- df %>% mutate(across(where(is.character), ~ suppressWarnings(as.numeric(.)))) # c) drop columns with too many NA (es. >90% missing) threshold <- 0.9 * nrow(df) df <- df[, colSums(is.na(df)) < threshold] # double check cat("Dimensions after cleaning:", dim(df), "\n") # --- 3. dependent variable descriptive statistics --- # avheat summary(df$avheat) # --- 4. Explorative analysis --- # avheat distribution ggplot(df, aes(x = avheat)) + geom_histogram(bins = 30, fill = "skyblue", color = "white") + theme_minimal() + labs(title = "avheat distribution") #positive asimmetry # creation log df$log_avheat <- log(df$avheat) # distribution log_net_AVERAGE ggplot(df, aes(x = log_avheat)) + geom_histogram(bins = 30, fill = "skyblue", color = "white") + theme_minimal() + labs(title = "log_avheat distribution") # --- Selection variables for correlation --- vars_corr <- df %>% select( laperp, avliva, owner, renter,bavsalary, starts_with("resident"), children, workad, unemployed, retired, sassist,gfa, year, condition, avelectr ) # --- Pearson correlation matrix --- cor_mat_p <- cor(vars_corr, use = "pairwise.complete.obs", method = "pearson") corrplot(cor_mat_p, method = "color", type = "upper", tl.cex = 0.6, tl.col = "black", title = "Pearson correlation matrix", mar = c(0,0,2,0)) # --- Spearman correlation matrix --- cor_mat_s <- cor(vars_corr, use = "pairwise.complete.obs", method = "spearman") corrplot(cor_mat_s, method = "color", type = "upper", tl.cex = 0.6, tl.col = "black", title = "Spearman correlation matrix", mar = c(0,0,2,0)) # --- Pearson correlation (numeric) --- round(cor_mat_p, 2) # --- Spearman correlation (numeric) --- round(cor_mat_s, 2) # --- 5. MLR --- Baseline model1 <- lm(log_avheat ~ bavsalary + laperp + country + owner, data = df %>% filter(is.finite(log_avheat))) summary(model1) # Diagnostic multicollinearity vif(model1) # Residuals par(mfrow = c(2,2)) plot(model1) # robust SE coeftest(model1, vcov = vcovHC(model1, type = "HC3")) df$log_avheat_bis <- log1p(df$avheat) # --- 5. MLR --- model2 <- lm(log_avheat_bis ~ bavsalary + laperp + country + owner, data = df) summary(model2) # Diagnostic multicollinearity vif(model2) # Residuals par(mfrow = c(2,2)) plot(model2) # robust SE coeftest(model2, vcov = vcovHC(model2, type = "HC3")) # does not change the situation. The first model will be kept. # --- 5. MLR ---Test with renter instead of owner model3 <- lm(log_avheat ~ bavsalary + laperp + country + renter, data = df %>% filter(is.finite(log_avheat))) summary(model3) # Diagnostic multicollinearity vif(model3) # Residuals par(mfrow = c(2,2)) plot(model3) # robust SE coeftest(model3, vcov = vcovHC(model3, type = "HC3")) #renter or owner does nor really matter, let's keep owner # --- 5. MLR ---Adding children under 18 year old % model4 <- lm(log_avheat ~ bavsalary + laperp + country + owner + children, data = df %>% filter(is.finite(log_avheat))) summary(model4) # Diagnostic multicollinearitY vif(model4) # Residuals par(mfrow = c(2,2)) plot(model4) # robust SE coeftest(model4, vcov = vcovHC(model4, type = "HC3")) # children is not significant, also living area per person is not significant anymore and owner is only at 10% level significant # --- 5. MLR ---Adding working adults 18-60 of age % model5 <- lm(log_avheat ~ bavsalary + laperp + country + owner + workad, data = df %>% filter(is.finite(log_avheat))) summary(model5) # Diagnostic multicollinearity vif(model5) # Residuals par(mfrow = c(2,2)) plot(model5) # robust SE coeftest(model5, vcov = vcovHC(model5, type = "HC3")) # Adding the proportion of working adults shows multicolinearity problems vif is >5, SE test does not show significance. Salary and country stay significant # --- 5. MLR ---Adding the proportion of retired model6 <- lm(log_avheat ~ bavsalary + laperp + country + owner + retired, data = df %>% filter(is.finite(log_avheat))) summary(model6) # Diagnostic multicollinearity vif(model6) #multicollinearity is not good, retired shows 11, has to be removed # Residuals par(mfrow = c(2,2)) plot(model6) # robust SE coeftest(model6, vcov = vcovHC(model6, type = "HC3")) #retired is not significant and has correlation with owner and country # --- 5. MLR ---Adding the proportion of unemployed model7 <- lm(log_avheat ~ bavsalary + laperp + country + owner + unemployed, data = df %>% filter(is.finite(log_avheat))) summary(model7) # Diagnostic multicollinearity vif(model7) #multicollinearity increased, but unemployed stays acceptable # Residuals par(mfrow = c(2,2)) plot(model7) # robust SE coeftest(model7, vcov = vcovHC(model7, type = "HC3")) #unemployed dropped significance from 1% to 5% # --- 5. MLR ---Trying different number of residents in flats model8 <- lm(log_avheat ~ bavsalary + laperp + country + owner + resident0, data = df %>% filter(is.finite(log_avheat))) summary(model8) # Diagnostic multicollinearity vif(model8) # Residuals par(mfrow = c(2,2)) plot(model8) # robust SE coeftest(model8, vcov = vcovHC(model8, type = "HC3")) # resident0 significant, but affects country # --- 5. MLR ---Trying different number of residents in flats model9 <- lm(log_avheat ~ bavsalary + laperp + country + owner + resident1, data = df %>% filter(is.finite(log_avheat))) summary(model9) # Diagnostic multicollinearity vif(model9) # Residuals par(mfrow = c(2,2)) plot(model9) # robust SE coeftest(model9, vcov = vcovHC(model9, type = "HC3")) #resident1 vif coef is 7 - high, affects salary a lot. SE shows resident1 stat sign # --- 5. MLR ---Trying different number of residents in flats model10 <- lm(log_avheat ~ bavsalary + laperp + country + owner + resident2, data = df %>% filter(is.finite(log_avheat))) summary(model10) # Diagnostic multicollinearity vif(model10) # Residuals par(mfrow = c(2,2)) plot(model10) # robust SE coeftest(model10, vcov = vcovHC(model10, type = "HC3")) # resident2 is significant but based on vif test is about 26 - very high # --- 5. MLR ---Trying different number of residents in flats model11 <- lm(log_avheat ~ bavsalary + laperp + country + owner + resident3, data = df %>% filter(is.finite(log_avheat))) summary(model11) # Diagnostic multicollinearity vif(model11) # Residuals par(mfrow = c(2,2)) plot(model11) # robust SE coeftest(model11, vcov = vcovHC(model11, type = "HC3")) # resident3 is significant but there is probably multicolinearity problem. # --- 5. MLR ---Trying to add social assistance model12 <- lm(log_avheat ~ bavsalary + laperp + country + owner + sassist, data = df %>% filter(is.finite(log_avheat))) summary(model12) # Diagnostic multicollinearity vif(model12) # Residuals par(mfrow = c(2,2)) plot(model12) # robust SE coeftest(model12, vcov = vcovHC(model12, type = "HC3")) # all is highly statistically significant and R2 is higher compared to model1. Would keep sassist # --- 5.MLR --Ttrying to add technical characteristics, GFA model13 <- lm(log_avheat ~ bavsalary + laperp + country + owner + gfa, data = df %>% filter(is.finite(log_avheat))) summary(model13) # Diagnostic multicollinearity vif(model13) # Residuals par(mfrow = c(2,2)) plot(model13) # robust SE coeftest(model13, vcov = vcovHC(model13, type = "HC3")) # clear multicolinearity issue - probably gfa correlates with salary # --- 5. MLR ---Trying to add technical characteristics, condition model14 <- lm(log_avheat ~ bavsalary + laperp + country + owner + condition, data = df %>% filter(is.finite(log_avheat))) summary(model14) # Diagnostic multicollinearity vif(model14) # Residuals par(mfrow = c(2,2)) plot(model14) # robust SE coeftest(model14, vcov = vcovHC(model14, type = "HC3")) # vif test failed and seems correlation with salary again # --- 5. MLR ---Trying to add year model15 <- lm(log_avheat ~ bavsalary + laperp + country + owner + year, data = df %>% filter(is.finite(log_avheat))) summary(model15) # Diagnostic multicollinearity vif(model15) # Residuals par(mfrow = c(2,2)) plot(model15) # robust SE coeftest(model15, vcov = vcovHC(model15, type = "HC3")) # Very significant coefficients, vif is ok cor(df$bavsalary, df$log_avheat, use = "complete.obs") #correlation with dependent variable quite high, think carefully if they are not too similar # --- 5. MLR ---Trying to add average electricity consumption of a building model16 <- lm(log_avheat ~ bavsalary + laperp + country + owner + avelectr, data = df %>% filter(is.finite(log_avheat))) summary(model16) # Diagnostic multicollinearity vif(model16) # Residuals par(mfrow = c(2,2)) plot(model16) # robust SE coeftest(model16, vcov = vcovHC(model16, type = "HC3")) # not significant #Trying lasso X <- model.matrix( log_avheat ~ bavsalary + laperp + country + owner + unemployed + resident0 + sassist + year, data = df )[, -1] # Keep only rows in y corresponding to rows in X y <- df$log_avheat[as.numeric(rownames(X))] set.seed(123) # --- 1. Split train/test --- n <- nrow(X) train_idx <- sample(1:n, size = 0.7*n) # 70% training X_train <- X[train_idx, ] y_train <- y[train_idx] X_test <- X[-train_idx, ] y_test <- y[-train_idx] # --- 2. Cross-validation on the training --- cv_lasso <- cv.glmnet( X_train, y_train, alpha = 1, nfolds = 10, standardize = TRUE ) # --- 3. Final model with optimal lambda --- best_lambda <- cv_lasso$lambda.min model_lasso <- glmnet( X_train, y_train, alpha = 1, lambda = best_lambda, standardize = TRUE ) # --- 4. Predictions on test set --- y_pred <- as.vector(predict(model_lasso, newx = X_test, s = best_lambda)) # --- 5. Performance out of sample --- rmse <- sqrt(mean((y_test - y_pred)^2)) rsq <- cor(y_test, y_pred)^2 rmse rsq # Get coefficients at the optimal lambda lasso_coef <- coef(model_lasso, s = best_lambda) lasso_coef # Convert to a named vector and filter non-zero coefficients selected_vars <- lasso_coef[lasso_coef != 0] selected_vars data.frame( Variable = rownames(lasso_coef), Coefficient = as.vector(lasso_coef) ) %>% filter(Coefficient != 0) # lasso does not have problem of multicollinearity, however very correlated variables can cause instability. I checked a couple of time and indeed there is some variations but not so much, we can repeat 4,5 times and see if the variables are the same, if yes, it is ok # Assign weights to the variables (more weight means more importance) weights <- c(0.4, 0.3, 0.15, 0.1, 0.05) # Standardize the variables df$country_std <- scale(df$country) df$resident0_std <- scale(df$resident0) df$unemployed_std <- scale(df$unemployed) df$year_std <- scale(df$year) df$bavsalary_std <- scale(df$bavsalary) # Calculate a weighted score df$WeightedScore <- with( df, country_std * weights[1] + resident0_std * weights[2] + unemployed_std * weights[3] + year_std * weights[4] + bavsalary_std *weights[5] ) # Rank based on the weighted score df$Rank <- rank(-df$WeightedScore) # Drop rows with missing WeightedScore (NA) df_clean <- df[!is.na(df$WeightedScore), ] # Get bottom 122 buildings bottom_122 <-df_clean[head(order(df_clean$Rank), 122),] library(ggplot2) # ------------------- # Plot Bottom 122 # ------------------- ggplot(bottom_122,aes(x = reorder(house, Rank), y = Rank, fill = Rank)) + geom_bar(stat = "identity", show.legend = FALSE) + coord_flip() + labs(title = "Bottom 10 House Rankings", x = "House", y = "Rank") + theme_minimal() + scale_fill_gradient(low = "red", high = "orange") # Since for Slovenia there is no data for "unemployed", the final score should be calculated separately weights <- c(0.4, 0.3, 0.2, 0.1) df$WeightedScore1<- with( df, country_std * weights[1] + resident0_std * weights[2] + year_std * weights[3] + bavsalary_std *weights[4] ) # Rank based on the weighted score df$Rank1<- rank(-df$WeightedScore1) # Drop rows with missing WeightedScore (NA) df_clean <- df[!is.na(df$WeightedScore1),] # Get bottom 122 with 'unemployed' excluded bottom_122 <-df_clean[head(order(df_clean$Rank1),122),] ggplot(bottom_122,aes(x = reorder(house, Rank), y = Rank, fill = Rank)) + geom_bar(stat = "identity", show.legend = FALSE) + coord_flip() + labs(title = "Bottom 10 House Rankings", x = "House", y = "Rank") + theme_minimal() + scale_fill_gradient(low = "red", high = "orange")