library(robustbase)
library(ggplot2)
library(dplyr)
library(corrplot)
library(zoo)
library("SparkR")
library(ggplot2)
library(plm)
library(lmtest)
library(ecm)
library(car)
library(stargazer)
#loading data
rm(list = ls())
#Loading the data
data<-load(file="easySHARE_rel7_1_0.rda")
data<-easySHARE_rel7_1_0
View(data)
#descriptive statistics
#list of relevant variables
var_model<-c("numeracy_1","age", "female","books_age10", "eurod", "br010_mod", "eduyears_mod", "ch001_", "ch021_mod", "numeracy_dummy", "wave" )

#selecting data from waves 5
data_var<-select(data, c("numeracy_1","age", "female","books_age10", "eurod", "br010_mod", "eduyears_mod", "ch001_", "ch021_mod","wave"))
summary(data_var)

data_v5<-data_var[data_var$wave==5&data_var$books_age10>0&data_var$numeracy_1>=0&data_var$eurod>=0&data_var$eurod<=6&data_var$ch001_>0&data_var$age>=50&data_var$br010_mod>0&data_var$ch021_mod>0&data_var$ch021_mod<9.5&data_var$eduyears_mod>0,]
  data_v5$numeracy_dummy<-c(data_v5$numeracy_1>=4)

summary(data_var)
summary(data_v5)
count(data_v5)

stargazer(data_v5)


#descriptive statistics:
#1 summary (mean, variation max, ,min)
#2 boxplot(data), check outliers
#3 plot the data in histogram or graph again age or other variable
#4 describe data and add analytical thoughts, other sources

#lets check correlation matrix to identify any relationship between our potentially independent variables
  
cor_matrix<-cor(data_v5[,-c(1,10)])


stargazer(cor_matrix, title="Correlation Matrix")

  
# we did not identify any significant relationship 
# interesing relationships are books_age10 vs.eduyears_mod, ch001_ vs ch021_mod and  ch021_mod vs  age)

print(cor_matrix)

cor(data_v5)
round(cor(data_v5),3)
  
#Dependent variable numeracy
summary(data_v5$numeracy_1)
boxplot(data_v5$numeracy_1)#outliers are reasonable after omitting negative values, hence no need to eliminate any
  title(main="Numeracy_1")
hist(data_v5$numeracy_1, main="")
  title(main="Numeracy_1")
 
#Numeracy dummy  - transformation of the variable to better reflect the needs of proposed logit model (0/1)
summary(data_v5$numeracy_dummy)
barplot(table(data_v5$numeracy_dummy), main="Numeracy Pass/Fail Distribution",xlab="Number of Fails and Succeses in the test",names=c("Fail", "Pass"))

#Calculating some statistics about the dependent numeracy variable, around 51% of the respondents are considered to "fail" the test
count(data_v5[data_v5$numeracy_1==0,])
count(data_v5[data_v5$numeracy_1==1,])
count(data_v5[data_v5$numeracy_1==2,])
count(data_v5[data_v5$numeracy_1==3,])
count(data_v5[data_v5$numeracy_1==4,])
count(data_v5[data_v5$numeracy_1==5,])
sumar<-summary(as.factor(data_v5$numeracy_1))
print(sumar)
summary(data_v5$numeracy_1)
hist(data_v5$numeracy_1)
soucet<-sumar[1]+sumar[2]+sumar[3]
soucet/count(data_v5) #50 něco % má 4 a menší
sumar[1]

#Age of the respondetns
summary(data_v5$age)
#We found one observations that is smaller than 15 and hence we had to apply new condition for initial data selection
boxplot(data_v5$age)
  title(main="Boxplot of Age")
  #we set an abritrary treshold to determine whether a respondent is "old enough to participate in our study
  #Hence we set minimal age to 50 years
hist(data_v5$age, main="Histogram of Age", xlab="Age in years")

count(data_v5[data_v5$age>=90,])


count(data_var[data_var$eduyears_mod<9,])



#Gender variable (female), since this is a dummy variable we do not expect any outliers to be present
#creating some descriptive statistics to bring some perspective to the variable
summary(data_v5$female)#from the summary it is 
age_bar<-barplot(table(data_v5$female), main="Gender Distribution", xlab="Number of males and females", names=c("Male", "Female"), col=c("blue2", "red2"))
  text(age_bar, 2000, table(data_v5$female),cex=1,pos=3) 

#Books_age_10 - amouunt of book when 10 years old, it is not linear
  #since it is the variable of interest
  #no need to eliminate any values apart from the negative and specific ones (described in Chapter 3)
  summary(data_v5$books_age10)
  boxplot(data_v5$books_age10)#no visible) outliers (probably because of low scale 1-5 we do not even expect outliers to occur)
  hist(data_v5$books_age10, main="Histogram of Books around when 10 years old", xlab="Level (1 lowest, 5 highest)")
#This variables should be well translated to the levels described in the original mapping the the data set

#eurod - depression levels accroding to Eurostat
  #This variabel is not linear (it does not mean that person with score 10 is twice as depressed as a person with score 5)
summary(data_v5$eurod)#we are geting negative values, hence we have to adjust the filter at the start of our script
#describe the summary, for the purposes of thi thesis we can say that the higher the level, the more depressed the person is
hist(data_v5$eurod, main="Histogram of Depression Levels", xlab="Levels (1 lowest, 12 highest)")


  #eurod_counts<-table(data_v5[data_v5$female==1,"eurod"],data_v5[data_v5$female==0,"eurod"])
  #barplot(table(data_v5[data_v5$female==1,"eurod"],data_v5[data_v5$female==0,"eurod"]), main="Histogram of Depression Levels Combined", xlab="Levels (1 lowest, 12 highest)")

#eurod - depression statistics, accounted for outliers
#some people tend to identify themselves as maybe mor edepressed than they really are and hence this method of self-evaluation might skew the data
boxplot(data_v5$eurod)
IQR_eurod_upperbound<-3/2*(summary(data_v5$eurod)[5]-summary(data_v5$eurod)[2])+summary(data_v5$eurod)[5]
print(IQR_eurod_upperbound)
print(count(data_v5[data_v5$eurod>=IQR_eurod_upperbound,]))
summary(data_v5$eurod)#Since the upper bound was identified as 6 we have to adjust the filzter at the start of our script

#Now lets talk about differences between genders
summary(data_v5[data_v5$female==1&data_v5$eurod>=0,]$eurod)
summary(data_v5[data_v5$female==0&data_v5$eurod>=0,]$eurod)
count_female<-count(data_v5[data_v5$female==1,])
count_male<-count(data_v5[data_v5$female==0,])
count_female+count_male
summary(data_v5[data_v5$female==1,]$bro010_mod)
hist(data_v5[data_v5$female==1&data_v5$eurod>=0,]$eurod, xlab="Eurod grade", breaks=10, main="Histogram of EUROD - Female")
hist(data_v5[data_v5$female==0&data_v5$eurod>=0,]$eurod, xlab="Eurod grade", breaks=10, main="Histogram of EUROD - Male")
#generally we can say that women tend do describe themselves as more depressed than men, it can be caused by two reasons
#1-men are really less depressed than women
#2-men tend to hide that they are depressed and so they report lower numbers
#What is the relationship between age and depression? Correlation
cor(data_v5$eurod, data_v5$age)#not significant




#bro10_mod - Drinking represented by number of day a person drinks during the week, we can say that it is numeric
boxplot(data_v5$br010_mod)#some negative values exist and hence we have to adjust the filter of our data accordingly
summary(data_v5$br010_mod)
hist(data_v5$br010_mod, main=" ", xlab="Frequency during last three months", breaks = 6)

count(data_v5[data_v5$br010_mod>=6,])
count(data_v5[data_v5$br010_mod==1,])


#years of education 
summary(data$eduyears_mod) #negative values --> filter
summary(data_v5$eduyears_mod)
boxplot(data_v5$eduyears_mod)
hist(data_v5$eduyears_mod, main=" ", xlab=" ", breaks = 5)

#Number of children around the respondent (ch001)
summary(data$ch001_)#This variable contains negative values hence the filter at the start of the code has to be adjusted
summary(data_v5$ch001_)
boxplot(data_v5$ch001_)#There are some outliers, but are essentially reasonable and should not be caused by an error in the measurement
#Most people however have around 2 children
hist(data_v5$ch001_, main="How many chidlren do you have?", xlab="Number of children")

#ch021_mod - number of grandchildren
summary(data_v5$ch021_mod) #variable contains negative values again and so we have to adjust the filter
boxplot(data_v5$ch021_mod) #significant outliers, we have to establish upper bound and get rid of the outliers
#finding the upper bound
IQR_eurod_upperbound_ch021_mod<-3/2*(summary(data_v5$ch021_mod)[5]-summary(data_v5$ch021_mod)[2])+summary(data_v5$ch021_mod)[5]
print(IQR_eurod_upperbound_ch021_mod)#we find out that the upper bound is 9.5 and so we have to once again adjust the filter for our data
hist(data_v5$ch021_mod, main="How many grandchidlren do you have?", xlab="Number of grandchildren")








#MODELLING
#Logit and Probit - We have decided to use only logit since we expect that the extremes are possible and rather probable
#We are more interested in depicting the extremes rather than the body of the probability distribution (for modelling the body of distribution probit model is more suitable)
model_test_logit<-glm(numeracy_dummy ~ age + female  + books_age10 +eurod + br010_mod + eduyears_mod + ch001_ + ch021_mod ,data=data_v5, family="binomial")
summary(model_test_logit)


stargazer(model_test_logit)

#since the logit transforms the distribution using probability, we have to transform the estimates back in order to obtain a MARGINAL EFFECT of respective independetn variables
#Creating function to derive marginal effect
#Marginal effect at mean for respective parameters

# Extract betas from the model
betas = t(data.frame(coef(model_test_logit)))
const = betas[1]
b_age = betas[2]
b_female = betas[3]
b_books_age10 = betas[4]
b_eurod=betas[5]
b_bro010_mod=betas[6]
b_eduyears_mod=betas[7]
b_ch001_=betas[8]
b_ch021_mod=betas[9]
age + female  + books_age10 +eurod + br010_mod + eduyears_mod + ch001_ + ch021_mod

# Calculate mean values of x
x_age = mean(data_v5$age)
x_female = mean(data_v5$female)
x_books_age10 = mean(data_v5$books_age10)
x_eurod = mean(data_v5$eurod)
x_bro010_mod = mean(data_v5$br010_mod)
x_eduyears_mod = mean(data_v5$eduyears_mod)
x_ch001_ = mean(data_v5$ch001_)
x_ch021_mod = mean(data_v5$ch021_mod)

# Calculate x * beta at mean values of x
mean = const + b_age*x_age + b_female*x_female + b_books_age10*x_books_age10 + b_eurod*x_eurod +b_bro010_mod*x_bro010_mod + b_eduyears_mod*x_eduyears_mod + b_ch001_*x_ch001_ + b_ch021_mod*x_ch021_mod  # this is the value of x'b evaluated at mean(x)

LAMBDA <- function(x) { 1 / (1 + exp(-x))}
# Marginal effects, these values can be interpreted as marginal effect, which means that if x changes so does the dependent variable according to these slopes

age_slope = LAMBDA(mean)*(b_age)
female_slope = LAMBDA(mean)*(b_female)
books_age10_slope = LAMBDA(mean)*(b_books_age10)
eurod_slope= LAMBDA(mean)*(b_eurod)
bro010_mod_slope= LAMBDA(mean)*(b_bro010_mod)
eduyears_mod_slope= LAMBDA(mean)*(b_eduyears_mod)
ch001_slope= LAMBDA(mean)*(b_ch001_)
ch021_mod_slope= LAMBDA(mean)*(b_ch021_mod)

marginal_effects<-c(age_slope,female_slope,books_age10_slope,eurod_slope,bro010_mod_slope,eduyears_mod_slope, ch001_slope, ch021_mod_slope)
summary(marginal_effects)



print(age_slope)
print(female_slope)
print(books_age10_slope)
print(eurod_slope)
print(bro010_mod_slope)
print(eduyears_mod_slope)
print(ch001_slope)
print(ch021_mod_slope)