library(dplyr)
library(foreach)
library(heplots)

# Classify the columns of `model_data`.
#
# Returns a list with two character vectors of column names:
#   $factor  - columns for which is.factor() is TRUE (ordered factors included)
#   $numeric - columns that are numeric, integer or logical
# Columns of any other type (character, dates, ...) are in neither set,
# because none of the association measures below can be computed on them.
.xyz_column_kinds <- function(model_data) {
  is_factor <- vapply(model_data, is.factor, logical(1))
  is_numeric <- vapply(
    model_data,
    function(column) is.numeric(column) || is.logical(column),
    logical(1)
  )
  list(
    factor = names(model_data)[is_factor],
    numeric = names(model_data)[is_numeric]
  )
}

# Cramer's V between two categorical vectors.
#
# Uses the Pearson chi-squared statistic without continuity correction:
#   V = sqrt(chi2 / (n * (min(rows, cols) - 1)))
# where n and the table dimensions are taken from the observed contingency
# table, i.e. after pairs containing NA have been dropped.
# Returns NA when either vector has fewer than two observed levels, because
# the chi-squared test is undefined in that case.
.xyz_cramers_v <- function(x, y) {
  keep <- stats::complete.cases(x, y)
  x <- factor(x[keep])
  y <- factor(y[keep])
  if (nlevels(x) < 2L || nlevels(y) < 2L) {
    return(NA_real_)
  }
  chi2 <- stats::chisq.test(x, y, correct = FALSE)
  n_obs <- sum(chi2$observed)
  min_dim <- min(dim(chi2$observed))
  unname(sqrt(chi2$statistic / (n_obs * (min_dim - 1))))
}

# Eta squared (share of variance of `values` explained by `groups`) from a
# one-way ANOVA, computed as 1 - var(residuals) / var(values).
#
# Rows whose group is NA are removed from both variances so that numerator
# and denominator describe the same observations.
# Returns NA when fewer than two groups are observed.
.xyz_eta_squared <- function(values, groups) {
  keep <- !is.na(groups)
  values <- as.numeric(values[keep])
  groups <- factor(groups[keep])
  if (nlevels(groups) < 2L) {
    return(NA_real_)
  }
  # Fixed variable names keep the formula valid for any column name.
  fit <- stats::aov(values ~ groups)
  1 - stats::var(fit$residuals) / stats::var(values)
}

#' Pearson correlation between all numeric columns
#'
#' Missing values are replaced with 0 before the correlation is computed.
#'
#' @param model_data A data frame.
#' @return A data frame holding the Pearson correlation matrix of the
#'   numeric (numeric, integer or logical) columns of `model_data`; row and
#'   column names are the column names.
xyz_numeric_cor <- function(model_data) {
  numeric_cols <- .xyz_column_kinds(model_data)$numeric
  # drop = FALSE keeps a data frame even when there is a single column.
  numeric_df <- model_data[, numeric_cols, drop = FALSE]
  numeric_df[is.na(numeric_df)] <- 0

  # Simple correlation table
  as.data.frame(stats::cor(numeric_df, method = "pearson"))
}

#' Association strength (Cramer's V) between all factor columns
#'
#' For every pair of factor columns a Pearson chi-squared test for count
#' data is run (null hypothesis: independence) and converted to Cramer's V.
#'
#' @param model_data A data frame.
#' @return A square data frame indexed by factor column names on both axes.
#'   A cell is NA when one of the two columns has fewer than two observed
#'   levels.
xyz_chi_cor <- function(model_data) {
  factor_cols <- .xyz_column_kinds(model_data)$factor
  n_factors <- length(factor_cols)

  association <- matrix(
    NA_real_,
    nrow = n_factors,
    ncol = n_factors,
    dimnames = list(factor_cols, factor_cols)
  )

  for (i_col in factor_cols) {
    association[, i_col] <- vapply(
      factor_cols,
      function(j_col) {
        .xyz_cramers_v(model_data[[i_col]], model_data[[j_col]])
      },
      numeric(1)
    )
  }

  as.data.frame(association)
}

#' Association strength (eta squared) between numeric and factor columns
#'
#' For every (numeric, factor) pair a one-way ANOVA `numeric ~ factor` is
#' fitted (null hypothesis: all group means are equal) and eta squared, the
#' ANOVA equivalent of R squared, is reported. Missing numeric values are
#' replaced with 0 first.
#'
#' Side effect: appends one line per factor column to the file "text.log"
#' in the current working directory.
#'
#' @param model_data A data frame.
#' @return A data frame with one row per numeric column and one column per
#'   factor column. Cells stay NA for factors with fewer than two observed
#'   levels.
xyz_anova_cor <- function(model_data) {
  kinds <- .xyz_column_kinds(model_data)
  factor_cols <- kinds$factor
  numeric_cols <- kinds$numeric

  # ANOVA table
  eta_table <- matrix(
    NA_real_,
    nrow = length(numeric_cols),
    ncol = length(factor_cols),
    dimnames = list(numeric_cols, factor_cols)
  )

  for (i in factor_cols) {
    write(paste("Column : ", i), file = "text.log", append = TRUE)
    groups <- model_data[[i]]

    if (length(unique(groups)) > 1) {
      eta_table[, i] <- vapply(
        numeric_cols,
        function(j) {
          values <- model_data[[j]]
          values[is.na(values)] <- 0
          # https://stats.stackexchange.com/questions/78808/how-to-compute-eta2-in-anova-by-hand
          .xyz_eta_squared(values, groups)
        },
        numeric(1)
      )
    }
  }

  as.data.frame(eta_table)
}
Wednesday, February 28, 2018
Correlation - numeric, categorical combination
Subscribe to:
Posts (Atom)
Self Attention
x → Embedding → MultiHeadAttention → Concat → Project to lower dim → → Add(x) → LayerNorm → FFN → Add → LayerNorm Vocab to embedding t...
-
get_stratified_positions < - function ( input_label_list , percentage ) { set . seed ( 123...
-
1st Method (CLT): We calculate E[X] = proportion of successes. H0: proportion for a fair coin = 0.5. Calculate the p-value with a proportion test. 2nd Method ...