# ==============================
# Analysis CEESEN_BENDER - all countries together
# Correlations, OLS, LASSO
# ==============================

# --- Packages to install ---
library(readxl)       # import Excel
library(dplyr)        # data wrangling
library(ggplot2)      # graphs
library(corrplot)     # heatmap correlations
library(GGally)       # scatterplot matrix
library(car)          # VIF
library(lmtest)       # diagnostic tests 
library(sandwich)     # robust SE
library(glmnet)       # LASSO
library(broom)        # tidy results

# --- 1. Import data ---
df<- read_excel("C:/Users/YOUR NAME/Documents/Projects/countries_bender.xlsx")


# --- 2. cleaning ---
# a) substitute " " with NA
df[df == " "] <- NA

# b) convert character variables in numeric where possible
df <- df %>%
  mutate(across(where(is.character), ~ suppressWarnings(as.numeric(.))))

# c) drop columns with too many NA (es. >90% missing)
threshold <- 0.9 * nrow(df)
df <- df[, colSums(is.na(df)) < threshold]

# double check
cat("Dimensions after cleaning:", dim(df), "\n")

# --- 3. dependent variable descriptive statistics ---
# avheat
summary(df$avheat)

# --- 4. Explorative analysis ---
# avheat distribution
ggplot(df, aes(x = avheat)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "white") +
  theme_minimal() +
  labs(title = "avheat distribution")
#positive asimmetry


# creation log

df$log_avheat <- log(df$avheat)

# distribution log_net_AVERAGE
ggplot(df, aes(x = log_avheat)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "white") +
  theme_minimal() +
  labs(title = "log_avheat distribution")

# --- Selection variables for correlation ---
vars_corr <- df %>%
  select(
    laperp, avliva, owner, renter,bavsalary,
    starts_with("resident"),
    children, workad, unemployed, retired, sassist,gfa, year, condition,
    avelectr
  )

# --- Pearson correlation matrix ---
cor_mat_p <- cor(vars_corr, use = "pairwise.complete.obs", method = "pearson")

corrplot(cor_mat_p, method = "color", type = "upper",
         tl.cex = 0.6, tl.col = "black",
         title = "Pearson correlation matrix", mar = c(0,0,2,0))

# --- Spearman correlation matrix ---
cor_mat_s <- cor(vars_corr, use = "pairwise.complete.obs", method = "spearman")

corrplot(cor_mat_s, method = "color", type = "upper",
         tl.cex = 0.6, tl.col = "black",
         title = "Spearman correlation matrix", mar = c(0,0,2,0))


# --- Pearson correlation (numeric) ---
round(cor_mat_p, 2)

# --- Spearman correlation (numeric) ---
round(cor_mat_s, 2)

# --- 5. MLR --- Baseline
model1 <- lm(log_avheat ~ bavsalary + laperp + country + owner,
             data = df %>% filter(is.finite(log_avheat)))

summary(model1)

# Diagnostic multicollinearity
vif(model1)

# Residuals
par(mfrow = c(2,2))
plot(model1)

# robust SE 
coeftest(model1, vcov = vcovHC(model1, type = "HC3"))


df$log_avheat_bis <- log1p(df$avheat)

# --- 5. MLR ---
model2 <- lm(log_avheat_bis ~ bavsalary + laperp + country + owner,
             data = df)

summary(model2)

# Diagnostic multicollinearity
vif(model2)

# Residuals
par(mfrow = c(2,2))
plot(model2)

# robust  SE
coeftest(model2, vcov = vcovHC(model2, type = "HC3"))

# does not change the situation. The first model will be kept.


# --- 5. MLR ---Test with renter instead of owner
model3 <- lm(log_avheat ~ bavsalary + laperp + country + renter,
             data = df %>% filter(is.finite(log_avheat)))

summary(model3)

# Diagnostic multicollinearity
vif(model3)

# Residuals
par(mfrow = c(2,2))
plot(model3)

# robust  SE
coeftest(model3, vcov = vcovHC(model3, type = "HC3"))

#renter or owner does nor really matter, let's keep owner


# --- 5. MLR ---Adding children under 18 year old %
model4 <- lm(log_avheat ~ bavsalary + laperp + country + owner + children,
             data = df %>% filter(is.finite(log_avheat)))


summary(model4)

# Diagnostic multicollinearitY
vif(model4)

# Residuals
par(mfrow = c(2,2))
plot(model4)

# robust SE
coeftest(model4, vcov = vcovHC(model4, type = "HC3"))

# children is not significant, also living area per person is not significant anymore and owner is only at 10% level significant


# --- 5. MLR ---Adding working adults 18-60 of age %
model5 <- lm(log_avheat ~ bavsalary + laperp + country + owner + workad,
             data = df %>% filter(is.finite(log_avheat)))

summary(model5)

# Diagnostic multicollinearity
vif(model5)

# Residuals
par(mfrow = c(2,2))
plot(model5)

# robust SE
coeftest(model5, vcov = vcovHC(model5, type = "HC3"))

# Adding the proportion of working adults shows multicolinearity problems vif is >5, SE test does not show significance. Salary and country stay significant


# --- 5. MLR ---Adding the proportion of retired
model6 <- lm(log_avheat ~ bavsalary + laperp + country + owner + retired,
             data = df %>% filter(is.finite(log_avheat)))


summary(model6)

# Diagnostic multicollinearity
vif(model6)

#multicollinearity is not good, retired shows 11, has to be removed

# Residuals
par(mfrow = c(2,2))
plot(model6)

# robust SE
coeftest(model6, vcov = vcovHC(model6, type = "HC3"))

#retired is not significant and has correlation with owner and country

# --- 5. MLR ---Adding the proportion of unemployed
model7 <- lm(log_avheat ~ bavsalary + laperp + country + owner + unemployed,
             data = df %>% filter(is.finite(log_avheat)))


summary(model7)

# Diagnostic multicollinearity
vif(model7)

#multicollinearity increased, but unemployed stays acceptable

# Residuals
par(mfrow = c(2,2))
plot(model7)

# robust SE
coeftest(model7, vcov = vcovHC(model7, type = "HC3"))

#unemployed dropped significance from 1% to 5% 


# --- 5. MLR ---Trying different number of residents in flats
model8 <- lm(log_avheat ~ bavsalary + laperp + country + owner + resident0,
             data = df %>% filter(is.finite(log_avheat)))

summary(model8)

# Diagnostic multicollinearity
vif(model8)


# Residuals
par(mfrow = c(2,2))
plot(model8)

# robust SE 
coeftest(model8, vcov = vcovHC(model8, type = "HC3"))

# resident0 significant, but affects country



# --- 5. MLR ---Trying different number of residents in flats
model9 <- lm(log_avheat ~ bavsalary + laperp + country + owner + resident1,
             data = df %>% filter(is.finite(log_avheat)))


summary(model9)

# Diagnostic multicollinearity
vif(model9)


# Residuals
par(mfrow = c(2,2))
plot(model9)

# robust SE
coeftest(model9, vcov = vcovHC(model9, type = "HC3"))

#resident1 vif coef is 7 - high, affects salary a lot. SE shows resident1 stat sign


# --- 5. MLR ---Trying different number of residents in flats
model10 <- lm(log_avheat ~ bavsalary + laperp + country + owner + resident2,
              data = df %>% filter(is.finite(log_avheat)))


summary(model10)

# Diagnostic multicollinearity
vif(model10)


# Residuals
par(mfrow = c(2,2))
plot(model10)

# robust SE 
coeftest(model10, vcov = vcovHC(model10, type = "HC3"))

# resident2 is significant but based on vif test is about 26 - very high



# --- 5. MLR ---Trying different number of residents in flats
model11 <- lm(log_avheat ~ bavsalary + laperp + country + owner + resident3,
              data = df %>% filter(is.finite(log_avheat)))


summary(model11)

# Diagnostic multicollinearity
vif(model11)


# Residuals
par(mfrow = c(2,2))
plot(model11)

# robust SE
coeftest(model11, vcov = vcovHC(model11, type = "HC3"))

# resident3 is significant but there is probably multicolinearity problem.



# --- 5. MLR ---Trying to add social assistance
model12 <- lm(log_avheat ~ bavsalary + laperp + country + owner + sassist,
              data = df %>% filter(is.finite(log_avheat)))

summary(model12)

# Diagnostic multicollinearity
vif(model12)


# Residuals
par(mfrow = c(2,2))
plot(model12)

# robust SE
coeftest(model12, vcov = vcovHC(model12, type = "HC3"))

# all is highly statistically significant and R2 is higher compared to model1. Would keep sassist


# --- 5.MLR --Ttrying to add technical characteristics, GFA
model13 <- lm(log_avheat ~ bavsalary + laperp + country + owner + gfa,
              data = df %>% filter(is.finite(log_avheat)))


summary(model13)

# Diagnostic multicollinearity
vif(model13)


# Residuals
par(mfrow = c(2,2))
plot(model13)

# robust SE
coeftest(model13, vcov = vcovHC(model13, type = "HC3"))

# clear multicolinearity issue - probably gfa correlates with salary



# --- 5. MLR ---Trying to add technical characteristics, condition
model14 <- lm(log_avheat ~ bavsalary + laperp + country + owner + condition,
              data = df %>% filter(is.finite(log_avheat)))

summary(model14)

# Diagnostic multicollinearity
vif(model14)


# Residuals
par(mfrow = c(2,2))
plot(model14)

# robust SE
coeftest(model14, vcov = vcovHC(model14, type = "HC3"))

# vif test failed and seems correlation with salary again


# --- 5. MLR ---Trying to add year
model15 <- lm(log_avheat ~ bavsalary + laperp + country + owner + year,
              data = df %>% filter(is.finite(log_avheat)))


summary(model15)

# Diagnostic multicollinearity
vif(model15)


# Residuals
par(mfrow = c(2,2))
plot(model15)

# robust SE
coeftest(model15, vcov = vcovHC(model15, type = "HC3"))

# Very significant coefficients, vif is ok

cor(df$bavsalary, df$log_avheat, use = "complete.obs")

#correlation with dependent variable quite high, think carefully if they are not too similar

# --- 5. MLR ---Trying to add average electricity consumption of a building
model16 <- lm(log_avheat ~ bavsalary + laperp + country + owner + avelectr,
              data = df %>% filter(is.finite(log_avheat)))


summary(model16)

# Diagnostic multicollinearity
vif(model16)


# Residuals
par(mfrow = c(2,2))
plot(model16)

# robust SE
coeftest(model16, vcov = vcovHC(model16, type = "HC3"))

# not significant


#Trying lasso

X <- model.matrix(
  log_avheat ~ bavsalary + laperp + country + owner + unemployed + resident0 + sassist + year, 
  data = df
)[, -1]

# Keep only rows in y corresponding to rows in X
y <- df$log_avheat[as.numeric(rownames(X))]



set.seed(123)

# --- 1. Split train/test ---
n <- nrow(X)
train_idx <- sample(1:n, size = 0.7*n)  # 70% training
X_train <- X[train_idx, ]
y_train <- y[train_idx]
X_test  <- X[-train_idx, ]
y_test  <- y[-train_idx]

# --- 2. Cross-validation on the training ---
cv_lasso <- cv.glmnet(
  X_train, y_train,
  alpha = 1,
  nfolds = 10,
  standardize = TRUE
)

# --- 3. Final model with optimal lambda ---
best_lambda <- cv_lasso$lambda.min
model_lasso <- glmnet(
  X_train, y_train,
  alpha = 1,
  lambda = best_lambda,
  standardize = TRUE
)

# --- 4. Predictions on test set ---
y_pred <- as.vector(predict(model_lasso, newx = X_test, s = best_lambda))

# --- 5. Performance out of sample ---
rmse <- sqrt(mean((y_test - y_pred)^2))
rsq  <- cor(y_test, y_pred)^2


rmse
rsq


# Get coefficients at the optimal lambda
lasso_coef <- coef(model_lasso, s = best_lambda)
lasso_coef


# Convert to a named vector and filter non-zero coefficients
selected_vars <- lasso_coef[lasso_coef != 0]
selected_vars

data.frame(
  Variable = rownames(lasso_coef),
  Coefficient = as.vector(lasso_coef)
) %>%
  filter(Coefficient != 0)

#  lasso does not have problem of multicollinearity, however very correlated variables can cause instability. I checked a couple of time and indeed there is some variations but not so much, we can repeat 4,5 times and see if the variables are the same, if yes, it is ok

# Assign weights to the variables (more weight means more importance)
weights <- c(0.4, 0.3, 0.15, 0.1, 0.05)

# Standardize the variables
df$country_std <- scale(df$country)
df$resident0_std <- scale(df$resident0)
df$unemployed_std <- scale(df$unemployed)
df$year_std <- scale(df$year)
df$bavsalary_std <- scale(df$bavsalary)

# Calculate a weighted score
df$WeightedScore <- with(
  df,
  country_std * weights[1] + 
  resident0_std * weights[2] + 
  unemployed_std * weights[3] + 
  year_std * weights[4] +
  bavsalary_std *weights[5]
)

# Rank based on the weighted score
df$Rank <- rank(-df$WeightedScore)  

# Drop rows with missing WeightedScore (NA)
df_clean <- df[!is.na(df$WeightedScore), ]


# Get bottom 122 buildings
bottom_122 <-df_clean[head(order(df_clean$Rank), 122),]

library(ggplot2)

# -------------------
# Plot Bottom 122
# -------------------
ggplot(bottom_122,aes(x = reorder(house, Rank), y = Rank, fill = Rank)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  coord_flip() +
  labs(title = "Bottom 10 House Rankings", 
       x = "House", 
       y = "Rank") +
  theme_minimal() +
  scale_fill_gradient(low = "red", high = "orange")


# Since for Slovenia there is no data for "unemployed", the final score should be calculated separately

weights <- c(0.4, 0.3, 0.2, 0.1)

df$WeightedScore1<- with(
  df,
  country_std * weights[1] + 
    resident0_std * weights[2] + 
    year_std * weights[3] +
    bavsalary_std *weights[4]
)

# Rank based on the weighted score
df$Rank1<- rank(-df$WeightedScore1) 

# Drop rows with missing WeightedScore (NA)
df_clean <- df[!is.na(df$WeightedScore1),]


# Get bottom 122 with 'unemployed' excluded

bottom_122 <-df_clean[head(order(df_clean$Rank1),122),]

ggplot(bottom_122,aes(x = reorder(house, Rank), y = Rank, fill = Rank)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  coord_flip() +
  labs(title = "Bottom 10 House Rankings", 
       x = "House", 
       y = "Rank") +
  theme_minimal() +
  scale_fill_gradient(low = "red", high = "orange")