#######################################################################################
#######################################################################################
#######################################################################################
#######################################################################################
#######################################################################################
#######################################################################################

### First let's install and import the packages needed

### Install only the packages that are not available yet, so that re-running
### the script does not download and reinstall everything each time.
### The list holds the four packages originally installed here plus the other
### non-base packages attached by the library() calls below, which were
### previously assumed to be installed already.

required_packages <- c("sfsmisc", "lawstat", "stargazer", "RCurl",
                       "normtest", "lmtest", "sandwich", "readr")
is_installed <- vapply(required_packages, requireNamespace, logical(1),
                       quietly = TRUE)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}
library(MASS)
library(sfsmisc)
library(lawstat)
library(normtest)
library(lmtest)
library(stargazer)
library(sandwich)
library(readr)

### Load the analysis data. ENTER YOUR OWN PATH TO THE SOURCE FILE
### Czechoslovak_New_Wave_analysis_data.csv (semicolon-separated, read as UTF-8)

Czechoslovak_New_Wave_analysis_data <- read.csv2(
  file = "C:/Vojtech/Skola/baka/Czechoslovak_New_Wave_analysis_data.csv",
  encoding = "UTF-8"
)

View(Czechoslovak_New_Wave_analysis_data)

### Drop every row that contains at least one NA; the NA count is printed
### before and after so the effect of the removal can be checked

sum(is.na(Czechoslovak_New_Wave_analysis_data))
data <- na.omit(Czechoslovak_New_Wave_analysis_data)
sum(is.na(data))
View(data)

### Load the list of New Wave titles. Again ENTER YOUR OWN PATH TO THE SOURCE FILE
### Czechoslovak_New_Wave_movie_titles.csv

Czechoslovak_New_Wave_movie_titles <- read_delim(
  file = "C:/Vojtech/Skola/baka/Czechoslovak_New_Wave_movie_titles.csv",
  delim = ";",
  trim_ws = TRUE,
  escape_double = FALSE
)

View(Czechoslovak_New_Wave_movie_titles)

### Derive the length of the filming stage as atelier days plus exterior days

data$filming_stage <- data$atelier_days + data$exterior_days

### Split the main dataset into New Wave and non New Wave films, using the
### titles from the second loaded file as the membership list

titles_New_Wave_in_dataset <- intersect(
  data$czech_movie_name,
  Czechoslovak_New_Wave_movie_titles$movie_title
)

### Logical row masks: TRUE where the film title is one of the New Wave titles
positions_New_Wave <- data$czech_movie_name %in% titles_New_Wave_in_dataset
positions_non_New_Wave <- !positions_New_Wave

New_Wave <- subset(data, subset = positions_New_Wave)
non_New_Wave <- subset(data, subset = positions_non_New_Wave)

View(New_Wave)
View(non_New_Wave)

### Dummy variables for the later analysis. They are added to `data` only;
### the two subsets above were created before this point and do not get them.
### formatdummy: 1 = "wide screen", 0 = anything else
### colordummy: 2 = "color+B&W", 1 = "color", 0 = anything else
### New_Wavedummy: 1 = New Wave film, 0 = non New Wave film

data$formatdummy <- as.numeric(data$format == "wide screen")
data$colordummy <- ifelse(data$color == "color", 1,
                          ifelse(data$color == "color+B&W", 2, 0))
data$New_Wavedummy <- as.numeric(positions_New_Wave)

#######################################################################################
#######################################################################################
#######################################################################################
#######################################################################################
#######################################################################################
#######################################################################################

### Preparation is finished; the group comparisons start with the days filmed in exteriors

NWexter <- New_Wave$exterior_days
nNWexter <- non_New_Wave$exterior_days

### Notched horizontal boxplots: New Wave at position 1, non New Wave at position 2

boxplot(NWexter, nNWexter,
        names = c("New Wave", "non New Wave"),
        at = c(1, 2),
        horizontal = TRUE,
        notch = TRUE,
        col = c("cornflowerblue", "darkorange"),
        border = "brown",
        main = "Comparison exterior days",
        xlab = "days in exterior"
)

### The plot only visualises the data, so the difference is also tested statistically

### Normality test of exterior_days on the full dataset

shapiro.test(data$exterior_days)

### Reported result: the p-value is below 0.05 (the significance level used throughout),
### so the null hypothesis is rejected and exterior days are treated as not normally distributed

### Because the data are not normal, the two samples (New Wave and non New Wave) are compared
### with the independent two-group Mann-Whitney U test and, additionally, the Brunner-Munzel test

wilcox.test(exterior_days ~ New_Wavedummy, data = data)

### Reported result: the two samples are different (from different populations)

### For curiosity and assurance a second test is run (Brunner-Munzel test).
### NOTE(review): this call uses a one-sided alternative, while the Mann-Whitney call above
### uses the default alternative - confirm that the one-sided direction is intended

brunner.munzel.test(New_Wave$exterior_days, non_New_Wave$exterior_days, alternative = "greater")

### Reported result: at the 0.05 significance level the exterior days of New Wave and
### non New Wave films are nonidentical populations (same result as the Mann-Whitney test)

#######################################################################################
#######################################################################################
#######################################################################################

### Group comparison of the days filmed in the atelier

NWatel <- New_Wave$atelier_days
nNWatel <- non_New_Wave$atelier_days

### Notched horizontal boxplots: New Wave at position 1, non New Wave at position 2

boxplot(NWatel, nNWatel,
        names = c("New Wave", "non New Wave"),
        at = c(1, 2),
        horizontal = TRUE,
        notch = TRUE,
        col = c("cornflowerblue", "darkorange"),
        border = "brown",
        main = "Comparison atelier days",
        xlab = "days in atelier"
)

### The plot only visualises the data, so the difference is also tested statistically

### Normality test of atelier_days on the full dataset

shapiro.test(data$atelier_days)

### Reported result: the p-value is below 0.05 (the significance level used throughout),
### so the null hypothesis is rejected and atelier days are treated as not normally distributed

### Because the data are not normal, the two samples (New Wave and non New Wave) are compared
### with the independent two-group Mann-Whitney U test and, additionally, the Brunner-Munzel test

wilcox.test(atelier_days ~ New_Wavedummy, data = data)

### Reported result: the two samples are different (from different populations)

### For curiosity and assurance a second test is run (Brunner-Munzel test).
### NOTE(review): this call uses a one-sided alternative, while the Mann-Whitney call above
### uses the default alternative - confirm that the one-sided direction is intended

brunner.munzel.test(New_Wave$atelier_days, non_New_Wave$atelier_days, alternative = "less")

### Reported result: at the 0.05 significance level the atelier days of New Wave and
### non New Wave films are nonidentical populations (same result as the Mann-Whitney test)

#######################################################################################
#######################################################################################
#######################################################################################

### Combined boxplots for exterior and atelier days of both groups

### Widen the left margin so the rotated group labels fit. par() returns the
### previous settings; they are kept so the margins can be restored after the
### plot instead of leaking into every later plot of the session.

old_par <- par(mar = c(4, 6, 2, 1) + 0.1)

### Positions 1-2 hold the exterior pair and 4-5 the atelier pair (position 3 is
### left empty as a gap); the two colours are recycled across the four boxes

boxplot(NWexter, nNWexter, NWatel, nNWatel,
        main = "exterior and atelier in days",
        at = c(1, 2, 4, 5),
        names = c("NW exterior", "nNW exterior", "NW atelier", "nNW atelier"),
        las = 2,
        col = c("cornflowerblue", "darkorange"),
        border = "brown",
        horizontal = TRUE,
        notch = TRUE
)

### Restore the graphical parameters that were active before this plot

par(old_par)

### Further analysis: means and maxima

### Mean exterior and atelier days of the New Wave films

mean(New_Wave$exterior_days)
mean(New_Wave$atelier_days)


### New Wave film with the most exterior days / atelier days, and those maxima

New_Wave$czech_movie_name[which.max(NWexter)]
New_Wave$czech_movie_name[which.max(NWatel)]
max(NWexter)
max(NWatel)

### The same for the non New Wave films

non_New_Wave$czech_movie_name[which.max(nNWexter)]
non_New_Wave$czech_movie_name[which.max(nNWatel)]
max(nNWexter)
max(nNWatel)

#######################################################################################
#######################################################################################
#######################################################################################

### Group comparison of the filming stage (atelier days + exterior days)

New_Wave$filming_stage <- New_Wave$atelier_days + New_Wave$exterior_days
non_New_Wave$filming_stage <- non_New_Wave$atelier_days + non_New_Wave$exterior_days

NWfilming <- New_Wave$filming_stage
nNWfilming <- non_New_Wave$filming_stage

### Close the current graphics device so the next plot starts on a fresh device.
### dev.off() raises an error when only the null device (number 1) is open,
### e.g. when this section is run on its own, so it is called only when a
### real device exists.

if (dev.cur() > 1L) {
  dev.off()
}

### Notched horizontal boxplots: New Wave at position 1, non New Wave at position 2

boxplot(NWfilming, nNWfilming,
        main = "Comparison filming stage",
        at = c(1, 2),
        xlab = "number of days of filming stage",
        names = c("New Wave", "non New Wave"),
        col = c("cornflowerblue", "darkorange"),
        border = "brown",
        horizontal = TRUE,
        notch = TRUE
)

### Normality test of filming_stage on the full dataset

shapiro.test(data$filming_stage)

### Reported result: the p-value is below 0.05 (the significance level used throughout),
### so the null hypothesis is rejected and the filming stage is treated as not normally distributed

### Because the data are not normal, the two samples (New Wave and non New Wave) are compared
### with the independent two-group Mann-Whitney U test and, additionally, the Brunner-Munzel test

wilcox.test(filming_stage ~ New_Wavedummy, data = data)

### Reported result: the null hypothesis could not be rejected, so the two samples
### are not shown to differ (from the same populations)

### For curiosity and assurance a second test is run (two-sided Brunner-Munzel test)

brunner.munzel.test(New_Wave$filming_stage, non_New_Wave$filming_stage, alternative = "two.sided")

### Reported result: at the 0.05 significance level the filming stage of New Wave and
### non New Wave films are identical populations (same result as the Mann-Whitney test)

#######################################################################################
#######################################################################################
#######################################################################################

### Group comparison of the production time in days

NWprod <- New_Wave$production_days
nNWprod <- non_New_Wave$production_days

### Notched horizontal boxplots: New Wave at position 1, non New Wave at position 2

boxplot(NWprod, nNWprod,
        names = c("New Wave", "non New Wave"),
        at = c(1, 2),
        horizontal = TRUE,
        notch = TRUE,
        col = c("cornflowerblue", "darkorange"),
        border = "brown",
        main = "Comparison production days",
        xlab = "number of days of production"
)

### Normality test of production_days on the full dataset

shapiro.test(data$production_days)

### Reported result: the p-value is below 0.05 (the significance level used throughout),
### so the null hypothesis is rejected and production days are treated as not normally distributed

### Because the data are not normal, the two samples (New Wave and non New Wave) are compared
### with the independent two-group Mann-Whitney U test and, additionally, the Brunner-Munzel test

wilcox.test(production_days ~ New_Wavedummy, data = data)

### Reported result: the null hypothesis could not be rejected, so the two samples
### are not shown to differ (from the same populations)

### For curiosity and assurance a second test is run (two-sided Brunner-Munzel test)

brunner.munzel.test(New_Wave$production_days, non_New_Wave$production_days, alternative = "two.sided")

### Reported result: at the 0.05 significance level the production days of New Wave and
### non New Wave films are identical populations (same result as the Mann-Whitney test)

#######################################################################################
#######################################################################################
#######################################################################################

### Group comparison of the production costs

NWcosts <- New_Wave$costs
nNWcosts <- non_New_Wave$costs

### Notched horizontal boxplots: New Wave at position 1, non New Wave at position 2

boxplot(NWcosts, nNWcosts,
        names = c("New Wave", "non New Wave"),
        at = c(1, 2),
        horizontal = TRUE,
        notch = TRUE,
        col = c("cornflowerblue", "darkorange"),
        border = "brown",
        main = "Comparison of production costs",
        xlab = "costs in thousands"
)

### Normality test of costs on the full dataset

shapiro.test(data$costs)

### Reported result: the p-value is below 0.05 (the significance level used throughout),
### so the null hypothesis is rejected and production costs are treated as not normally distributed

### Because the data are not normal, the two samples (New Wave and non New Wave) are compared
### with the independent two-group Mann-Whitney U test and, additionally, the Brunner-Munzel test

wilcox.test(costs ~ New_Wavedummy, data = data)

### Reported result: the null hypothesis could not be rejected, so the two samples
### are not shown to differ (from the same populations)

### For curiosity and assurance a second test is run (two-sided Brunner-Munzel test)

brunner.munzel.test(New_Wave$costs, non_New_Wave$costs, alternative = "two.sided")

### Reported result: at the 0.05 significance level the production costs of New Wave and
### non New Wave films are identical populations (same result as the Mann-Whitney test)

### Further analysis of the extremes

### Most expensive film of each group and its costs

New_Wave$czech_movie_name[which.max(New_Wave$costs)]
non_New_Wave$czech_movie_name[which.max(non_New_Wave$costs)]
max(NWcosts)
max(nNWcosts)

### Group sizes; index n - 1 of the ascending sort is the second-highest value.
### n1 and n2 are reused by later sections of the script.

n1 <- nrow(New_Wave)
n2 <- nrow(non_New_Wave)
New_Wave$czech_movie_name[which(NWcosts == sort(NWcosts, partial = n1 - 1)[n1 - 1])]
non_New_Wave$czech_movie_name[which(nNWcosts == sort(nNWcosts, partial = n2 - 1)[n2 - 1])]
sort(NWcosts)[n1 - 1]
sort(nNWcosts)[n2 - 1]

### Cheapest film of each group and its costs

New_Wave$czech_movie_name[which.min(NWcosts)]
non_New_Wave$czech_movie_name[which.min(nNWcosts)]
min(NWcosts)
min(nNWcosts)

#######################################################################################
#######################################################################################
#######################################################################################

### Group comparison of the attendance

NWattendance <- New_Wave$attendance
nNWattendance <- non_New_Wave$attendance

### Notched horizontal boxplots: New Wave at position 1, non New Wave at position 2

boxplot(NWattendance, nNWattendance,
        main = "Comparison of attendance",
        at = c(1, 2),
        xlab = "attendance in thousands",
        names = c("New Wave", "non New Wave"),
        col = c("cornflowerblue", "darkorange"),
        border = "brown",
        horizontal = TRUE,
        notch = TRUE
)

### Normality test of attendance on the full dataset

shapiro.test(data$attendance)

### Reported result: the p-value is below 0.05 (the significance level used throughout),
### so the null hypothesis is rejected and attendance is treated as not normally distributed

### Because the data are not normal, the two samples (New Wave and non New Wave) are compared
### with the independent two-group Mann-Whitney U test and, additionally, the Brunner-Munzel test

wilcox.test(attendance ~ New_Wavedummy, data = data)

### Reported result: the two samples are different (from different populations)

### For curiosity and assurance a second test is run (two-sided Brunner-Munzel test)

brunner.munzel.test(New_Wave$attendance, non_New_Wave$attendance, alternative = "two.sided")

### Reported result: at the 0.05 significance level the attendance of New Wave and
### non New Wave films are nonidentical populations (same result as the Mann-Whitney test)

### Further analysis of the extremes

### Films with zero recorded attendance in each group

New_Wave$czech_movie_name[which(New_Wave$attendance == 0)]
non_New_Wave$czech_movie_name[which(non_New_Wave$attendance == 0)]

### New Wave: film with the highest and with the second-highest attendance, and those values.
### n1 (New Wave group size) and n2 (non New Wave group size) are defined in the
### production costs section; index n - 1 of the ascending sort is the second-highest value.

New_Wave$czech_movie_name[which.max(New_Wave$attendance)]
New_Wave$czech_movie_name[which(New_Wave$attendance == sort(New_Wave$attendance, partial = n1 - 1)[n1 - 1])]
max(New_Wave$attendance)
sort(New_Wave$attendance)[n1 - 1]

### Non New Wave: the same lookups. The second-highest value must be taken at index n2 - 1
### (the size of this group); using the New Wave size n1 here would pick an unrelated
### order statistic and disagree with the value printed two lines below.

non_New_Wave$czech_movie_name[which.max(non_New_Wave$attendance)]
non_New_Wave$czech_movie_name[which(non_New_Wave$attendance == sort(non_New_Wave$attendance, partial = n2 - 1)[n2 - 1])]
max(non_New_Wave$attendance)
sort(non_New_Wave$attendance)[n2 - 1]

#######################################################################################
#######################################################################################
#######################################################################################

### Group comparison of the gross sales

NWsales <- New_Wave$sales
nNWsales <- non_New_Wave$sales

### Notched horizontal boxplots: New Wave at position 1, non New Wave at position 2

boxplot(NWsales, nNWsales,
        names = c("New Wave", "non New Wave"),
        at = c(1, 2),
        horizontal = TRUE,
        notch = TRUE,
        col = c("cornflowerblue", "darkorange"),
        border = "brown",
        main = "Comparison of sales",
        xlab = "gross sales in thousands"
)

### Normality test of sales on the full dataset

shapiro.test(data$sales)

### Reported result: the p-value is below 0.05 (the significance level used throughout),
### so the null hypothesis is rejected and gross sales are treated as not normally distributed

### Because the data are not normal, the two samples (New Wave and non New Wave) are compared
### with the independent two-group Mann-Whitney U test and, additionally, the Brunner-Munzel test

wilcox.test(sales ~ New_Wavedummy, data = data)

### Reported result: the two samples are different (from different populations)

### For curiosity and assurance a second test is run (two-sided Brunner-Munzel test)

brunner.munzel.test(New_Wave$sales, non_New_Wave$sales, alternative = "two.sided")

### Reported result: at the 0.05 significance level the gross sales of New Wave and
### non New Wave films are nonidentical populations (same result as the Mann-Whitney test)

#######################################################################################
#######################################################################################
#######################################################################################

### Group comparison of the gains

NWgains <- New_Wave$gains
nNWgains <- non_New_Wave$gains

### Notched horizontal boxplots: New Wave at position 1, non New Wave at position 2

boxplot(NWgains, nNWgains,
        names = c("New Wave", "non New Wave"),
        at = c(1, 2),
        horizontal = TRUE,
        notch = TRUE,
        col = c("cornflowerblue", "darkorange"),
        border = "brown",
        main = "Comparison of gains",
        xlab = "gains in thousands"
)

### Normality test of gains on the full dataset

shapiro.test(data$gains)

### Reported result: the p-value is below 0.05 (the significance level used throughout),
### so the null hypothesis is rejected and gains are treated as not normally distributed

### Because the data are not normal, the two samples (New Wave and non New Wave) are compared
### with the independent two-group Mann-Whitney U test and, additionally, the Brunner-Munzel test

wilcox.test(gains ~ New_Wavedummy, data = data)

### Reported result: the two samples are different (from different populations)

### For curiosity and assurance a second test is run (two-sided Brunner-Munzel test)

brunner.munzel.test(New_Wave$gains, non_New_Wave$gains, alternative = "two.sided")

### Reported result: at the 0.05 significance level the gains of New Wave and
### non New Wave films are nonidentical populations (same result as the Mann-Whitney test)

### Further analysis of the extremes.
### n1 and n2 (the group sizes) are defined in the production costs section.

### New Wave: films with the lowest and second-lowest gains, and those values

New_Wave$czech_movie_name[which.min(NWgains)]
New_Wave$czech_movie_name[which(NWgains == sort(NWgains)[2])]
min(NWgains)
sort(NWgains)[2]

### Non New Wave: films with the lowest and second-lowest gains, and those values

non_New_Wave$czech_movie_name[which.min(nNWgains)]
non_New_Wave$czech_movie_name[which(nNWgains == sort(nNWgains)[2])]
min(nNWgains)
sort(nNWgains)[2]

### New Wave: films with the highest and second-highest gains, and those values

New_Wave$czech_movie_name[which.max(NWgains)]
New_Wave$czech_movie_name[which(NWgains == sort(NWgains)[n1 - 1])]
max(NWgains)
sort(NWgains)[n1 - 1]

### Non New Wave: films with the highest and second-highest gains, and those values

non_New_Wave$czech_movie_name[which.max(nNWgains)]
non_New_Wave$czech_movie_name[which(nNWgains == sort(nNWgains)[n2 - 1])]
max(nNWgains)
sort(nNWgains)[n2 - 1]

### Mean gains of each group

mean(NWgains)
mean(nNWgains)

#######################################################################################
#######################################################################################
#######################################################################################
#######################################################################################
#######################################################################################
#######################################################################################

### MODEL for production costs
### Regresses costs on production, atelier and exterior days plus the New Wave,
### colour and format dummies; fits OLS, tests it for heteroskedasticity, reports
### robust inference and finally re-estimates the model by feasible GLS.

### Classic OLS regression model.
### NOTE(review): colordummy is coded 0/1/2 where it is created, so it enters here as a
### single numeric regressor - confirm that a linear step between the three colour
### categories is intended (otherwise it should be a factor).

ols <- lm(costs ~ production_days + atelier_days + exterior_days + New_Wavedummy + colordummy + formatdummy, data = data)
summary(ols)

### Test the model for heteroskedasticity with the Breusch-Pagan test

bptest(ols)

### Reported result: heteroskedasticity is definitely present, so the standard errors of the
### plain OLS regression are unreliable and other approaches are needed.

### Robust standard errors are one way of dealing with heteroskedasticity.
### The coefficient t tests are recomputed with the vcovHC covariance matrix (type "HC0")
### to see which variables are statistically significant.

coeftest(ols, vcov = vcovHC(ols, type = "HC0"))

### Robust F (Wald) test, described by the author as a test of the whole model.
### NOTE(review): the restricted model ols2 still contains production_days, so this call
### tests the joint significance of the five remaining regressors only. A test of the
### whole model would compare against an intercept-only model (costs ~ 1) - confirm
### which of the two is intended.

ols2 <- lm(costs ~ production_days, data = data)

waldtest(ols2, ols, vcov = vcovHC(ols, type = "HC0"))

### Reported result: the model is statistically significant

### Second approach: feasible GLS (FGLS) to deal with the heteroskedasticity

### OLS residuals
u_hat <- resid(ols)

### Log of the squared residuals, the dependent variable of the variance regression
u_hat2 <- log((u_hat)^2)

### Regress the log squared residuals on the same regressors as the main model
reg_u_hat2 <- lm(u_hat2 ~ production_days + atelier_days + exterior_days + New_Wavedummy + colordummy + formatdummy, data = data)

### Fitted values of the variance regression (still on the log scale)
g_hat <- fitted(reg_u_hat2)

### Back-transform to the variance scale
g_hat2 <- exp(g_hat)

### Weights are the inverse of the estimated variances
W <- 1/g_hat2

### Weighted least squares with the estimated weights = FGLS estimate
fgls <- lm(costs ~ production_days + atelier_days + exterior_days + New_Wavedummy + colordummy + formatdummy, data = data, weights = W)
summary(fgls)

### Side-by-side text table of OLS and FGLS. Compare the COEFFICIENTS ONLY, NOT THE STANDARD ERRORS

stargazer(ols, fgls, column.labels = c("OLS", "FGLS"), type = "text", keep.stat = c("n", "rsq"))

### Final comparison of the picture format between the two groups
### (only the format is compared in this section)

### Frequency tables of the format values per group; kept together with their
### numeric versions so that any code relying on these objects still finds them

a <- table(New_Wave$format)
b <- table(non_New_Wave$format)

a_values <- as.numeric(a)
b_values <- as.numeric(b)

### Ratio of wide screen films to all other films in each group.
### The films are counted by the "wide screen" label instead of by position in the
### frequency table: positional indexing depends on the alphabetical order of the
### labels and returns NA (or the wrong ratio) when a group contains only one format.
### With exactly two formats, where "wide screen" sorts last, the result equals
### a_values[2] / a_values[1].

NWratio_widescreen_classic <- sum(New_Wave$format == "wide screen") /
  sum(New_Wave$format != "wide screen")
NWratio_widescreen_classic

nNWratio_widescreen_classic <- sum(non_New_Wave$format == "wide screen") /
  sum(non_New_Wave$format != "wide screen")
nNWratio_widescreen_classic

#######################################################################################
#######################################################################################
#######################################################################################
#######################################################################################
#######################################################################################
#######################################################################################