library(robustbase) library(ggplot2) library(dplyr) library(corrplot) library(zoo) library("SparkR") library(ggplot2) library(plm) library(lmtest) library(ecm) library(car) library(stargazer) #loading data rm(list = ls()) #Loading the data data<-load(file="easySHARE_rel7_1_0.rda") data<-easySHARE_rel7_1_0 View(data) #descriptive statistics #list of relevant variables var_model<-c("numeracy_1","age", "female","books_age10", "eurod", "br010_mod", "eduyears_mod", "ch001_", "ch021_mod", "numeracy_dummy", "wave" ) #selecting data from waves 5 data_var<-select(data, c("numeracy_1","age", "female","books_age10", "eurod", "br010_mod", "eduyears_mod", "ch001_", "ch021_mod","wave")) summary(data_var) data_v5<-data_var[data_var$wave==5&data_var$books_age10>0&data_var$numeracy_1>=0&data_var$eurod>=0&data_var$eurod<=6&data_var$ch001_>0&data_var$age>=50&data_var$br010_mod>0&data_var$ch021_mod>0&data_var$ch021_mod<9.5&data_var$eduyears_mod>0,] data_v5$numeracy_dummy<-c(data_v5$numeracy_1>=4) summary(data_var) summary(data_v5) count(data_v5) stargazer(data_v5) #descriptive statistics: #1 summary (mean, variation max, ,min) #2 boxplot(data), check outliers #3 plot the data in histogram or graph again age or other variable #4 describe data and add analytical thoughts, other sources #lets check correlation matrix to identify any relationship between our potentially independent variables cor_matrix<-cor(data_v5[,-c(1,10)]) stargazer(cor_matrix, title="Correlation Matrix") # we did not identify any significant relationship # interesing relationships are books_age10 vs.eduyears_mod, ch001_ vs ch021_mod and ch021_mod vs age) print(cor_matrix) cor(data_v5) round(cor(data_v5),3) #Dependent variable numeracy summary(data_v5$numeracy_1) boxplot(data_v5$numeracy_1)#outliers are reasonable after omitting negative values, hence no need to eliminate any title(main="Numeracy_1") hist(data_v5$numeracy_1, main="") title(main="Numeracy_1") #Numeracy dummy - transformation of the variable to better reflect the needs of proposed logit model (0/1) summary(data_v5$numeracy_dummy) barplot(table(data_v5$numeracy_dummy), main="Numeracy Pass/Fail Distribution",xlab="Number of Fails and Succeses in the test",names=c("Fail", "Pass")) #Calculating some statistics about the dependent numeracy variable, around 51% of the respondents are considered to "fail" the test count(data_v5[data_v5$numeracy_1==0,]) count(data_v5[data_v5$numeracy_1==1,]) count(data_v5[data_v5$numeracy_1==2,]) count(data_v5[data_v5$numeracy_1==3,]) count(data_v5[data_v5$numeracy_1==4,]) count(data_v5[data_v5$numeracy_1==5,]) sumar<-summary(as.factor(data_v5$numeracy_1)) print(sumar) summary(data_v5$numeracy_1) hist(data_v5$numeracy_1) soucet<-sumar[1]+sumar[2]+sumar[3] soucet/count(data_v5) #50 něco % má 4 a menší sumar[1] #Age of the respondetns summary(data_v5$age) #We found one observations that is smaller than 15 and hence we had to apply new condition for initial data selection boxplot(data_v5$age) title(main="Boxplot of Age") #we set an abritrary treshold to determine whether a respondent is "old enough to participate in our study #Hence we set minimal age to 50 years hist(data_v5$age, main="Histogram of Age", xlab="Age in years") count(data_v5[data_v5$age>=90,]) count(data_var[data_var$eduyears_mod<9,]) #Gender variable (female), since this is a dummy variable we do not expect any outliers to be present #creating some descriptive statistics to bring some perspective to the variable summary(data_v5$female)#from the summary it is age_bar<-barplot(table(data_v5$female), main="Gender Distribution", xlab="Number of males and females", names=c("Male", "Female"), col=c("blue2", "red2")) text(age_bar, 2000, table(data_v5$female),cex=1,pos=3) #Books_age_10 - amouunt of book when 10 years old, it is not linear #since it is the variable of interest #no need to eliminate any values apart from the negative and specific ones (described in Chapter 3) summary(data_v5$books_age10) boxplot(data_v5$books_age10)#no visible) outliers (probably because of low scale 1-5 we do not even expect outliers to occur) hist(data_v5$books_age10, main="Histogram of Books around when 10 years old", xlab="Level (1 lowest, 5 highest)") #This variables should be well translated to the levels described in the original mapping the the data set #eurod - depression levels accroding to Eurostat #This variabel is not linear (it does not mean that person with score 10 is twice as depressed as a person with score 5) summary(data_v5$eurod)#we are geting negative values, hence we have to adjust the filter at the start of our script #describe the summary, for the purposes of thi thesis we can say that the higher the level, the more depressed the person is hist(data_v5$eurod, main="Histogram of Depression Levels", xlab="Levels (1 lowest, 12 highest)") #eurod_counts<-table(data_v5[data_v5$female==1,"eurod"],data_v5[data_v5$female==0,"eurod"]) #barplot(table(data_v5[data_v5$female==1,"eurod"],data_v5[data_v5$female==0,"eurod"]), main="Histogram of Depression Levels Combined", xlab="Levels (1 lowest, 12 highest)") #eurod - depression statistics, accounted for outliers #some people tend to identify themselves as maybe mor edepressed than they really are and hence this method of self-evaluation might skew the data boxplot(data_v5$eurod) IQR_eurod_upperbound<-3/2*(summary(data_v5$eurod)[5]-summary(data_v5$eurod)[2])+summary(data_v5$eurod)[5] print(IQR_eurod_upperbound) print(count(data_v5[data_v5$eurod>=IQR_eurod_upperbound,])) summary(data_v5$eurod)#Since the upper bound was identified as 6 we have to adjust the filzter at the start of our script #Now lets talk about differences between genders summary(data_v5[data_v5$female==1&data_v5$eurod>=0,]$eurod) summary(data_v5[data_v5$female==0&data_v5$eurod>=0,]$eurod) count_female<-count(data_v5[data_v5$female==1,]) count_male<-count(data_v5[data_v5$female==0,]) count_female+count_male summary(data_v5[data_v5$female==1,]$bro010_mod) hist(data_v5[data_v5$female==1&data_v5$eurod>=0,]$eurod, xlab="Eurod grade", breaks=10, main="Histogram of EUROD - Female") hist(data_v5[data_v5$female==0&data_v5$eurod>=0,]$eurod, xlab="Eurod grade", breaks=10, main="Histogram of EUROD - Male") #generally we can say that women tend do describe themselves as more depressed than men, it can be caused by two reasons #1-men are really less depressed than women #2-men tend to hide that they are depressed and so they report lower numbers #What is the relationship between age and depression? Correlation cor(data_v5$eurod, data_v5$age)#not significant #bro10_mod - Drinking represented by number of day a person drinks during the week, we can say that it is numeric boxplot(data_v5$br010_mod)#some negative values exist and hence we have to adjust the filter of our data accordingly summary(data_v5$br010_mod) hist(data_v5$br010_mod, main=" ", xlab="Frequency during last three months", breaks = 6) count(data_v5[data_v5$br010_mod>=6,]) count(data_v5[data_v5$br010_mod==1,]) #years of education summary(data$eduyears_mod) #negative values --> filter summary(data_v5$eduyears_mod) boxplot(data_v5$eduyears_mod) hist(data_v5$eduyears_mod, main=" ", xlab=" ", breaks = 5) #Number of children around the respondent (ch001) summary(data$ch001_)#This variable contains negative values hence the filter at the start of the code has to be adjusted summary(data_v5$ch001_) boxplot(data_v5$ch001_)#There are some outliers, but are essentially reasonable and should not be caused by an error in the measurement #Most people however have around 2 children hist(data_v5$ch001_, main="How many chidlren do you have?", xlab="Number of children") #ch021_mod - number of grandchildren summary(data_v5$ch021_mod) #variable contains negative values again and so we have to adjust the filter boxplot(data_v5$ch021_mod) #significant outliers, we have to establish upper bound and get rid of the outliers #finding the upper bound IQR_eurod_upperbound_ch021_mod<-3/2*(summary(data_v5$ch021_mod)[5]-summary(data_v5$ch021_mod)[2])+summary(data_v5$ch021_mod)[5] print(IQR_eurod_upperbound_ch021_mod)#we find out that the upper bound is 9.5 and so we have to once again adjust the filter for our data hist(data_v5$ch021_mod, main="How many grandchidlren do you have?", xlab="Number of grandchildren") #MODELLING #Logit and Probit - We have decided to use only logit since we expect that the extremes are possible and rather probable #We are more interested in depicting the extremes rather than the body of the probability distribution (for modelling the body of distribution probit model is more suitable) model_test_logit<-glm(numeracy_dummy ~ age + female + books_age10 +eurod + br010_mod + eduyears_mod + ch001_ + ch021_mod ,data=data_v5, family="binomial") summary(model_test_logit) stargazer(model_test_logit) #since the logit transforms the distribution using probability, we have to transform the estimates back in order to obtain a MARGINAL EFFECT of respective independetn variables #Creating function to derive marginal effect #Marginal effect at mean for respective parameters # Extract betas from the model betas = t(data.frame(coef(model_test_logit))) const = betas[1] b_age = betas[2] b_female = betas[3] b_books_age10 = betas[4] b_eurod=betas[5] b_bro010_mod=betas[6] b_eduyears_mod=betas[7] b_ch001_=betas[8] b_ch021_mod=betas[9] age + female + books_age10 +eurod + br010_mod + eduyears_mod + ch001_ + ch021_mod # Calculate mean values of x x_age = mean(data_v5$age) x_female = mean(data_v5$female) x_books_age10 = mean(data_v5$books_age10) x_eurod = mean(data_v5$eurod) x_bro010_mod = mean(data_v5$br010_mod) x_eduyears_mod = mean(data_v5$eduyears_mod) x_ch001_ = mean(data_v5$ch001_) x_ch021_mod = mean(data_v5$ch021_mod) # Calculate x * beta at mean values of x mean = const + b_age*x_age + b_female*x_female + b_books_age10*x_books_age10 + b_eurod*x_eurod +b_bro010_mod*x_bro010_mod + b_eduyears_mod*x_eduyears_mod + b_ch001_*x_ch001_ + b_ch021_mod*x_ch021_mod # this is the value of x'b evaluated at mean(x) LAMBDA <- function(x) { 1 / (1 + exp(-x))} # Marginal effects, these values can be interpreted as marginal effect, which means that if x changes so does the dependent variable according to these slopes age_slope = LAMBDA(mean)*(b_age) female_slope = LAMBDA(mean)*(b_female) books_age10_slope = LAMBDA(mean)*(b_books_age10) eurod_slope= LAMBDA(mean)*(b_eurod) bro010_mod_slope= LAMBDA(mean)*(b_bro010_mod) eduyears_mod_slope= LAMBDA(mean)*(b_eduyears_mod) ch001_slope= LAMBDA(mean)*(b_ch001_) ch021_mod_slope= LAMBDA(mean)*(b_ch021_mod) marginal_effects<-c(age_slope,female_slope,books_age10_slope,eurod_slope,bro010_mod_slope,eduyears_mod_slope, ch001_slope, ch021_mod_slope) summary(marginal_effects) print(age_slope) print(female_slope) print(books_age10_slope) print(eurod_slope) print(bro010_mod_slope) print(eduyears_mod_slope) print(ch001_slope) print(ch021_mod_slope)