#######################################################################################
#######################################################################################
#######################################################################################
### Czechoslovak New Wave analysis script.
###
### Loads the film dataset and the list of New Wave titles, splits the films into
### New Wave / non New Wave groups, compares the two groups variable by variable
### (boxplot, Shapiro-Wilk normality test, Mann-Whitney U test, Brunner-Munzel test)
### and finally fits OLS, heteroskedasticity-robust and FGLS models of production costs.

### First let's install (only when missing) and import the packages needed
for (pkg in c("sfsmisc", "lawstat", "stargazer", "RCurl")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

library(MASS)
library(sfsmisc)
library(lawstat)
library(normtest)
library(lmtest)
library(stargazer)
library(sandwich)
library(readr)

### Now load the data and ENTER YOUR PATH TO THE SOURCE FILE CALLED
### Czechoslovak_New_Wave_analysis_data.csv
Czechoslovak_New_Wave_analysis_data <- read.csv2(
  "C:/Vojtech/Skola/baka/Czechoslovak_New_Wave_analysis_data.csv",
  encoding = "UTF-8"
)
View(Czechoslovak_New_Wave_analysis_data)

### Omit NA values (the counts before and after are printed as a check)
sum(is.na(Czechoslovak_New_Wave_analysis_data))
data <- na.omit(Czechoslovak_New_Wave_analysis_data)
sum(is.na(data))
View(data)

### Load the New Wave titles file and again ENTER YOUR PATH TO THE SOURCE FILE CALLED
### Czechoslovak_New_Wave_movie_titles.csv
Czechoslovak_New_Wave_movie_titles <- read_delim(
  "C:/Vojtech/Skola/baka/Czechoslovak_New_Wave_movie_titles.csv",
  ";",
  escape_double = FALSE,
  trim_ws = TRUE
)
View(Czechoslovak_New_Wave_movie_titles)

### Create another variable, filming stage, from exterior days and atelier days
data$filming_stage <- data$exterior_days + data$atelier_days

### Create subsets of the main dataset, split by the titles found in the second file
titles_New_Wave_in_dataset <- intersect(
  data$czech_movie_name,
  Czechoslovak_New_Wave_movie_titles$movie_title
)
positions_New_Wave <- data$czech_movie_name %in% titles_New_Wave_in_dataset
positions_non_New_Wave <- !positions_New_Wave
New_Wave <- subset(data, positions_New_Wave)
non_New_Wave <- subset(data, positions_non_New_Wave)
View(New_Wave)
View(non_New_Wave)

### Now let's proceed to the data, first create dummies for further analysis
data$formatdummy <- ifelse(data$format == "wide screen", 1, 0)
### NOTE(review): colordummy is a 0/1/2 code that later enters the regressions as a
### numeric variable, not as a factor - confirm this is intended.
data$colordummy <- ifelse(
  data$color == "color+B&W",
  2,
  ifelse(data$color == "color", 1, 0)
)
data$New_Wavedummy <- ifelse(positions_New_Wave, 1, 0)

#######################################################################################
#######################################################################################
#######################################################################################
### The first part (preparation for the analysis) has ended; now let's begin with
### creating boxplots for exterior days
NWexter <- New_Wave$exterior_days
nNWexter <- non_New_Wave$exterior_days
boxplot(
  NWexter, nNWexter,
  main = "Comparison exterior days",
  at = c(1, 2),
  xlab = "days in exterior",
  names = c("New Wave", "non New Wave"),
  col = c('cornflowerblue', 'darkorange'),
  border = "brown",
  horizontal = TRUE,
  notch = TRUE
)

### The data are visualised; now it is necessary to test the data statistically
### Let's test the variable exterior_days for normality
shapiro.test(data$exterior_days)
### p-value is less than 0.05 (we use the 0.05 significance level), therefore the null
### hypothesis is rejected and we conclude that exterior days is not normally distributed

### The data are not normal, therefore to test whether the two samples (New Wave and
### non New Wave) are similar, the independent 2-group Mann-Whitney U Test has to be
### used, possibly followed by the Brunner-Munzel Test
wilcox.test(exterior_days ~ New_Wavedummy, data = data)
### The two samples are different (from different populations)

### For curiosity and assurance we can run another test (Brunner-Munzel Test)
brunner.munzel.test(
  New_Wave$exterior_days,
  non_New_Wave$exterior_days,
  alternative = "greater"
)
### At the 0.05 significance level, we conclude that the exterior days data of New Wave
### and non New Wave are nonidentical populations (the same result as in the
### Mann-Whitney Test)

#######################################################################################
#######################################################################################
#######################################################################################
### Now let's create boxplots for atelier days
NWatel <- New_Wave$atelier_days
nNWatel <- non_New_Wave$atelier_days
boxplot(
  NWatel, nNWatel,
  main = "Comparison atelier days",
  at = c(1, 2),
  xlab = "days in atelier",
  names = c("New Wave", "non New Wave"),
  col = c('cornflowerblue', 'darkorange'),
  border = "brown",
  horizontal = TRUE,
  notch = TRUE
)

### The data are visualised; now it is necessary to test the data statistically
### Let's test the variable atelier_days for normality
shapiro.test(data$atelier_days)
### p-value is less than 0.05 (we use the 0.05 significance level), therefore the null
### hypothesis is rejected and we conclude that atelier days is not normally distributed

### The data are not normal, therefore to test whether the two samples (New Wave and
### non New Wave) are similar, the independent 2-group Mann-Whitney U Test has to be
### used, possibly followed by the Brunner-Munzel Test
wilcox.test(atelier_days ~ New_Wavedummy, data = data)
### The two samples are different (from different populations)

### For curiosity and assurance we can run another test (Brunner-Munzel Test)
brunner.munzel.test(
  New_Wave$atelier_days,
  non_New_Wave$atelier_days,
  alternative = "less"
)
### At the 0.05 significance level, we conclude that the atelier days data of New Wave
### and non New Wave are nonidentical populations (the same result as in the
### Mann-Whitney Test)

#######################################################################################
#######################################################################################
#######################################################################################
### Now let's create boxplots for both exterior and atelier days
### (wider left margin so the horizontal group labels fit)
par(mar = c(4, 6, 2, 1) + 0.1)
boxplot(
  NWexter, nNWexter, NWatel, nNWatel,
  main = "exterior and atelier in days",
  at = c(1, 2, 4, 5),
  names = c("NW exterior", "nNW exterior", "NW atelier", "nNW atelier"),
  las = 2,
  col = c('cornflowerblue', 'darkorange'),
  border = "brown",
  horizontal = TRUE,
  notch = TRUE
)

### Further analysis: means and maxima
mean(New_Wave$exterior_days)
mean(New_Wave$atelier_days)
New_Wave$czech_movie_name[which.max(NWexter)]
New_Wave$czech_movie_name[which.max(NWatel)]
max(NWexter)
max(NWatel)
non_New_Wave$czech_movie_name[which.max(nNWexter)]
non_New_Wave$czech_movie_name[which.max(nNWatel)]
max(nNWexter)
max(nNWatel)

#######################################################################################
#######################################################################################
#######################################################################################
### Now let's create boxplots for filming stage
New_Wave$filming_stage <- New_Wave$atelier_days + New_Wave$exterior_days
non_New_Wave$filming_stage <- non_New_Wave$atelier_days + non_New_Wave$exterior_days
NWfilming <- New_Wave$filming_stage
nNWfilming <- non_New_Wave$filming_stage
### Close the current graphics device so that the next plot starts on a fresh device
### without the par(mar = ...) setting made above
dev.off()
boxplot(
  NWfilming, nNWfilming,
  main = "Comparison filming stage",
  at = c(1, 2),
  xlab = "number of days of filming stage",
  names = c("New Wave", "non New Wave"),
  col = c('cornflowerblue', 'darkorange'),
  border = "brown",
  horizontal = TRUE,
  notch = TRUE
)

### Testing filming stage for normality
shapiro.test(data$filming_stage)
### p-value is less than 0.05 (we use the 0.05 significance level), therefore the null
### hypothesis is rejected and we conclude that filming stage is not normally distributed

### The data are not normal, therefore to test whether the two samples (New Wave and
### non New Wave) are similar, the independent 2-group Mann-Whitney U Test has to be
### used, possibly followed by the Brunner-Munzel Test
wilcox.test(filming_stage ~ New_Wavedummy, data = data)
### We failed to reject the null hypothesis, thus the two samples are not different
### (from the same population)

### For curiosity and assurance we can run another test (Brunner-Munzel Test)
brunner.munzel.test(
  New_Wave$filming_stage,
  non_New_Wave$filming_stage,
  alternative = "two.sided"
)
### At the 0.05 significance level, we conclude that the filming stage data of New Wave
### and non New Wave are identical populations (the same result as in the
### Mann-Whitney Test)

#######################################################################################
#######################################################################################
#######################################################################################
### Now let's create boxplots for production time
NWprod <- New_Wave$production_days
nNWprod <- non_New_Wave$production_days
boxplot(
  NWprod, nNWprod,
  main = "Comparison production days",
  at = c(1, 2),
  xlab = "number of days of production",
  names = c("New Wave", "non New Wave"),
  col = c('cornflowerblue', 'darkorange'),
  border = "brown",
  horizontal = TRUE,
  notch = TRUE
)

### Testing production days for normality
shapiro.test(data$production_days)
### p-value is less than 0.05 (we use the 0.05 significance level), therefore the null
### hypothesis is rejected and we conclude that production days is not normally
### distributed

### The data are not normal, therefore to test whether the two samples (New Wave and
### non New Wave) are similar, the independent 2-group Mann-Whitney U Test has to be
### used, possibly followed by the Brunner-Munzel Test
wilcox.test(production_days ~ New_Wavedummy, data = data)
### We failed to reject the null hypothesis, thus the two samples are not different
### (from the same population)

### For curiosity and assurance we can run another test (Brunner-Munzel Test)
brunner.munzel.test(
  New_Wave$production_days,
  non_New_Wave$production_days,
  alternative = "two.sided"
)
### At the 0.05 significance level, we conclude that the production days data of New
### Wave and non New Wave are identical populations (the same result as in the
### Mann-Whitney Test)

#######################################################################################
#######################################################################################
#######################################################################################
### Now let's create boxplots for production costs
NWcosts <- New_Wave$costs
nNWcosts <- non_New_Wave$costs
boxplot(
  NWcosts, nNWcosts,
  main = "Comparison of production costs",
  at = c(1, 2),
  xlab = "costs in thousands",
  names = c("New Wave", "non New Wave"),
  col = c('cornflowerblue', 'darkorange'),
  border = "brown",
  horizontal = TRUE,
  notch = TRUE
)

### Testing production costs for normality
shapiro.test(data$costs)
### p-value is less than 0.05 (we use the 0.05 significance level), therefore the null
### hypothesis is rejected and we conclude that production costs is not normally
### distributed

### The data are not normal, therefore to test whether the two samples (New Wave and
### non New Wave) are similar, the independent 2-group Mann-Whitney U Test has to be
### used, possibly followed by the Brunner-Munzel Test
wilcox.test(costs ~ New_Wavedummy, data = data)
### We failed to reject the null hypothesis, thus the two samples are not different
### (from the same population)

### For curiosity and assurance we can run another test (Brunner-Munzel Test)
brunner.munzel.test(
  New_Wave$costs,
  non_New_Wave$costs,
  alternative = "two.sided"
)
### At the 0.05 significance level, we conclude that the production costs data of New
### Wave and non New Wave are identical populations (the same result as in the
### Mann-Whitney Test)

### Further analysis: titles and values of the highest, second-highest and lowest costs
New_Wave$czech_movie_name[which.max(NWcosts)]
non_New_Wave$czech_movie_name[which.max(nNWcosts)]
max(New_Wave$costs)
max(non_New_Wave$costs)

### Group sizes; position n - 1 of the ascending sort is the second-highest value
n1 <- length(New_Wave$czech_movie_name)
n2 <- length(non_New_Wave$czech_movie_name)
New_Wave$czech_movie_name[
  which(New_Wave$costs == sort(New_Wave$costs, partial = n1 - 1)[n1 - 1])
]
non_New_Wave$czech_movie_name[
  which(non_New_Wave$costs == sort(non_New_Wave$costs, partial = n2 - 1)[n2 - 1])
]
sort(New_Wave$costs)[n1 - 1]
sort(non_New_Wave$costs)[n2 - 1]
New_Wave$czech_movie_name[which.min(New_Wave$costs)]
non_New_Wave$czech_movie_name[which.min(non_New_Wave$costs)]
min(New_Wave$costs)
min(non_New_Wave$costs)

#######################################################################################
#######################################################################################
#######################################################################################
### Now let's create boxplots for attendance
NWattendance <- New_Wave$attendance
nNWattendance <- non_New_Wave$attendance
boxplot(
  NWattendance, nNWattendance,
  main = "Comparison of attendance",
  at = c(1, 2),
  xlab = "attendance in thousands",
  names = c("New Wave", "non New Wave"),
  col = c('cornflowerblue', 'darkorange'),
  border = "brown",
  horizontal = TRUE,
  notch = TRUE
)

### Testing attendance for normality
shapiro.test(data$attendance)
### p-value is less than 0.05 (we use the 0.05 significance level), therefore the null
### hypothesis is rejected and we conclude that attendance is not normally distributed

### The data are not normal, therefore to test whether the two samples (New Wave and
### non New Wave) are similar, the independent 2-group Mann-Whitney U Test has to be
### used, possibly followed by the Brunner-Munzel Test
wilcox.test(attendance ~ New_Wavedummy, data = data)
### The two samples are different (from different populations)

### For curiosity and assurance we can run another test (Brunner-Munzel Test)
brunner.munzel.test(
  New_Wave$attendance,
  non_New_Wave$attendance,
  alternative = "two.sided"
)
### At the 0.05 significance level, we conclude that the attendance data of New Wave
### and non New Wave are nonidentical populations (the same result as in the
### Mann-Whitney Test)

### Further analysis: films with zero attendance, highest and second-highest attendance
New_Wave$czech_movie_name[which(New_Wave$attendance == 0)]
non_New_Wave$czech_movie_name[which(non_New_Wave$attendance == 0)]
New_Wave$czech_movie_name[which.max(New_Wave$attendance)]
New_Wave$czech_movie_name[
  which(New_Wave$attendance == sort(New_Wave$attendance, partial = n1 - 1)[n1 - 1])
]
max(New_Wave$attendance)
sort(New_Wave$attendance)[n1 - 1]
non_New_Wave$czech_movie_name[which.max(non_New_Wave$attendance)]
### The non New Wave group has n2 films, so its second-highest value sits at n2 - 1
non_New_Wave$czech_movie_name[
  which(
    non_New_Wave$attendance ==
      sort(non_New_Wave$attendance, partial = n2 - 1)[n2 - 1]
  )
]
max(non_New_Wave$attendance)
sort(non_New_Wave$attendance)[n2 - 1]

#######################################################################################
#######################################################################################
#######################################################################################
### Now let's create boxplots for gross sales
NWsales <- New_Wave$sales
nNWsales <- non_New_Wave$sales
boxplot(
  NWsales, nNWsales,
  main = "Comparison of sales",
  at = c(1, 2),
  xlab = "gross sales in thousands",
  names = c("New Wave", "non New Wave"),
  col = c('cornflowerblue', 'darkorange'),
  border = "brown",
  horizontal = TRUE,
  notch = TRUE
)

### Testing gross sales for normality
shapiro.test(data$sales)
### p-value is less than 0.05 (we use the 0.05 significance level), therefore the null
### hypothesis is rejected and we conclude that gross sales is not normally distributed

### The data are not normal, therefore to test whether the two samples (New Wave and
### non New Wave) are similar, the independent 2-group Mann-Whitney U Test has to be
### used, possibly followed by the Brunner-Munzel Test
wilcox.test(sales ~ New_Wavedummy, data = data)
### The two samples are different (from different populations)

### For curiosity and assurance we can run another test (Brunner-Munzel Test)
brunner.munzel.test(
  New_Wave$sales,
  non_New_Wave$sales,
  alternative = "two.sided"
)
### At the 0.05 significance level, we conclude that the gross sales data of New Wave
### and non New Wave are nonidentical populations (the same result as in the
### Mann-Whitney Test)

#######################################################################################
#######################################################################################
#######################################################################################
### Now let's create boxplots for gains
NWgains <- New_Wave$gains
nNWgains <- non_New_Wave$gains
boxplot(
  NWgains, nNWgains,
  main = "Comparison of gains",
  at = c(1, 2),
  xlab = "gains in thousands",
  names = c("New Wave", "non New Wave"),
  col = c('cornflowerblue', 'darkorange'),
  border = "brown",
  horizontal = TRUE,
  notch = TRUE
)

### Testing gains for normality
shapiro.test(data$gains)
### p-value is less than 0.05 (we use the 0.05 significance level), therefore the null
### hypothesis is rejected and we conclude that gains is not normally distributed

### The data are not normal, therefore to test whether the two samples (New Wave and
### non New Wave) are similar, the independent 2-group Mann-Whitney U Test has to be
### used, possibly followed by the Brunner-Munzel Test
wilcox.test(gains ~ New_Wavedummy, data = data)
### The two samples are different (from different populations)

### For curiosity and assurance we can run another test (Brunner-Munzel Test)
brunner.munzel.test(
  New_Wave$gains,
  non_New_Wave$gains,
  alternative = "two.sided"
)
### At the 0.05 significance level, we conclude that the gains of New Wave
### and non New Wave are nonidentical populations (the same result as in the
### Mann-Whitney Test)

### Further analysis: lowest, second-lowest, highest and second-highest gains, and means
New_Wave$czech_movie_name[which.min(New_Wave$gains)]
New_Wave$czech_movie_name[which(New_Wave$gains == sort(New_Wave$gains)[2])]
min(New_Wave$gains)
sort(New_Wave$gains)[2]
non_New_Wave$czech_movie_name[which.min(non_New_Wave$gains)]
non_New_Wave$czech_movie_name[
  which(non_New_Wave$gains == sort(non_New_Wave$gains)[2])
]
min(non_New_Wave$gains)
sort(non_New_Wave$gains)[2]
New_Wave$czech_movie_name[which.max(New_Wave$gains)]
New_Wave$czech_movie_name[which(New_Wave$gains == sort(New_Wave$gains)[n1 - 1])]
max(New_Wave$gains)
sort(New_Wave$gains)[n1 - 1]
non_New_Wave$czech_movie_name[which.max(non_New_Wave$gains)]
non_New_Wave$czech_movie_name[
  which(non_New_Wave$gains == sort(non_New_Wave$gains)[n2 - 1])
]
max(non_New_Wave$gains)
sort(non_New_Wave$gains)[n2 - 1]
mean(New_Wave$gains)
mean(non_New_Wave$gains)

#######################################################################################
#######################################################################################
#######################################################################################
### MODEL for production costs

### Classic OLS regression model
ols <- lm(
  costs ~ production_days + atelier_days + exterior_days +
    New_Wavedummy + colordummy + formatdummy,
  data = data
)
summary(ols)

### Let's test the model for heteroskedasticity with the Breusch-Pagan Test
bptest(ols)
### Heteroskedasticity is definitely present, therefore other models should be created,
### because the standard errors of the plain regression are unreliable.

### Robust standard errors are one way of dealing with heteroskedasticity.
### The author used the robust t test to obtain robust standard errors and to find out
### whether the variables are statistically significant
coeftest(ols, vcov = vcovHC(ols, type = "HC0"))

### To test whether the whole model is statistically significant the author uses a
### robust F test.
### NOTE(review): the restricted model below still contains production_days, so this
### Wald test checks the joint significance of the remaining regressors only; a test of
### the whole model would use an intercept-only restricted model - confirm the intent.
ols2 <- lm(costs ~ production_days, data = data)
waldtest(ols2, ols, vcov = vcovHC(ols, type = "HC0"))
### The model is statistically significant

### Another model, FGLS, that deals with heteroskedasticity:
### regress the log of the squared OLS residuals on the regressors, exponentiate the
### fitted values and use their reciprocals as weights
u_hat <- resid(ols)
u_hat2 <- log((u_hat)^2)
reg_u_hat2 <- lm(
  u_hat2 ~ production_days + atelier_days + exterior_days +
    New_Wavedummy + colordummy + formatdummy,
  data = data
)
g_hat <- fitted(reg_u_hat2)
g_hat2 <- exp(g_hat)
W <- 1 / g_hat2
fgls <- lm(
  costs ~ production_days + atelier_days + exterior_days +
    New_Wavedummy + colordummy + formatdummy,
  data = data,
  weights = W
)
summary(fgls)

### Comparison of coefficients ONLY, NOT STANDARD ERRORS
stargazer(
  ols, fgls,
  column.labels = c("OLS", "FGLS"),
  type = "text",
  keep.stat = c("n", "rsq")
)

### Final comparison of formats: ratio of the second to the first format count
### NOTE(review): the ratio relies on the order of the levels returned by table();
### confirm that position 2 is "wide screen" and position 1 the classic format.
a <- table(New_Wave$format)
b <- table(non_New_Wave$format)
a_values <- as.numeric(a)
NWratio_widescreen_classic <- a_values[2] / a_values[1]
NWratio_widescreen_classic
b_values <- as.numeric(b)
nNWratio_widescreen_classic <- b_values[2] / b_values[1]
nNWratio_widescreen_classic

#######################################################################################
#######################################################################################
#######################################################################################