dimanche 19 juin 2022

R: Unexpected Nulls Appearing in Output

I am working with the R programming language.

I was able to run the following code without any problems:

# first code: works fine
library(dplyr)
library(ranger)

# Step 1: Simulate an imbalanced two-class data set (10,000 rows of class 1,
# 100 rows of class 0), then split it into a held-out test set and a
# training set.
original_data <- rbind(
  data_1 = data.frame(
    class = 1,
    height = rnorm(10000, 180, 10),
    weight = rnorm(10000, 90, 10),
    salary = rnorm(10000, 50000, 10000)
  ),
  data_2 = data.frame(
    class = 0,
    height = rnorm(100, 160, 10),
    weight = rnorm(100, 100, 10),
    salary = rnorm(100, 40000, 10000)
  )
)

original_data$class <- as.factor(original_data$class)
# Unique row identifier; it is the join key for the train/test split below.
original_data$id <- seq_len(nrow(original_data))

# Hold out 30 rows of class 0 and 2000 rows of class 1 (no replacement).
test_set <- rbind(
  original_data[
    sample(which(original_data$class == "0"), size = 30, replace = FALSE),
  ],
  original_data[
    sample(which(original_data$class == "1"), size = 2000, replace = FALSE),
  ]
)

# Every row that was not held out becomes training data. Joining on `id`
# alone selects the same rows as joining on all shared columns (ids are
# unique), but avoids matching on floating-point columns and silences the
# "Joining with ..." message.
train_set <- anti_join(original_data, test_set, by = "id")

# Step 2: Create "Balanced" Random Subsets:
#
# Draws 100 bootstrap subsets from `train_set`, each holding 50 rows of
# class 0 and 60 rows of class 1 (sampled with replacement), tags every
# subset with its iteration number, and finally publishes the subsets in the
# global environment as train_set_1, ..., train_set_100.

# Preallocate the result list instead of growing it inside the loop.
results <- vector("list", 100)
for (i in seq_along(results)) {
  iteration_i <- i

  # Class 0 is drawn first, then class 1, so the random-number stream is
  # consumed in a fixed order within each iteration.
  sample_i <- rbind(
    train_set[
      sample(which(train_set$class == "0"), size = 50, replace = TRUE),
    ],
    train_set[
      sample(which(train_set$class == "1"), size = 60, replace = TRUE),
    ]
  )

  results_tmp <- data.frame(iteration_i, sample_i)
  results_tmp$iteration_i <- as.factor(results_tmp$iteration_i)
  results[[i]] <- results_tmp
}

# One long data frame with all iterations stacked on top of each other.
results_df <- do.call(rbind.data.frame, results)

# Split back by iteration. The column is spelled out in full (`iteration_i`)
# rather than relying on `$` partial matching of `iteration`.
X <- split(results_df, results_df$iteration_i)

# Publish each subset as train_set_<i> in the global environment.
# invisible() keeps the returned environment from being printed.
invisible(list2env(
  setNames(results, paste0("train_set_", seq_along(results))),
  envir = .GlobalEnv
))

I am now trying to run the same code in parallel - here is my attempt:

# second code: does not work fine
library(doParallel)
library(foreach)

# Parallel attempt at the balanced-resampling step. The code is kept exactly
# as posted; the comments explain what each part does and why the output
# differs from the sequential version.

# Register a parallel backend that uses every detected core.
registerDoParallel(cores = detectCores())
# foreach() returns a list with one element per iteration, each element being
# the value of the LAST expression in the body. The result is not assigned
# here, so at top level the whole list is auto-printed.
foreach(i = 1:100) %dopar% {
    
    
    # `results` is re-created inside the body, so every iteration starts from
    # an empty list instead of accumulating into one shared list.
    results <- list()
    
    {
        iteration_i = i
        
        # 50 rows of class 0 and 60 rows of class 1, sampled with replacement.
        sample_i =  rbind(train_set[ sample( which( train_set$class == "0" ) , replace = TRUE , 50 ) , ], train_set[ sample( which( train_set$class == "1" ) , replace = TRUE, 60 ) , ])
        
        results_tmp = data.frame(iteration_i, sample_i)
        results_tmp$iteration_i = as.factor(results_tmp$iteration_i)
        # Assigning at position i of an empty list pads positions 1..(i - 1)
        # with NULL; only element i holds a data frame.
        results[[i]] <- results_tmp
        
    }
    
    # Local to this iteration; the value is never returned or stored.
    results_df <- do.call(rbind.data.frame, results)
    
    # `$iteration` reaches the `iteration_i` column through partial matching.
    # `X` is also local to this iteration and is discarded.
    X<-split(results_df, results_df$iteration)
    
    # assign() targets .GlobalEnv as seen by the process running this task.
    # NOTE(review): with a parallel backend that is presumably a worker
    # process, so no train_set_<i> objects reach the calling session - confirm
    # for the backend in use.
    # invisible() only suppresses top-level printing: the lapply() result (a
    # list of length i whose first i - 1 entries are NULL) is still the last
    # value of the body, so foreach() collects it. That is the source of the
    # [[100]][[91]] NULL ... entries in the printed output.
    # To mirror the sequential code, make `results_tmp` the last expression,
    # store the foreach() result in the calling session, and assign there.
    invisible(lapply(seq_along(results),
                     function(i,x) {assign(paste0("train_set_",i),x[[i]], envir=.GlobalEnv)},
                     x=results))
    
}

# Shut down the implicitly created cluster, if one was started.
stopImplicitCluster()

The code appears to have run, but unlike the first code, no additional objects (the train_set_1, train_set_2, ... data frames) were created in the global environment, and a series of NULLs now appears in the output:

[[100]][[91]]
NULL

[[100]][[92]]
NULL

[[100]][[93]]
NULL

[[100]][[94]]
NULL

[[100]][[95]]
NULL

[[100]][[96]]
NULL

[[100]][[97]]
NULL

[[100]][[98]]
NULL

[[100]][[99]]
NULL

My Question: Can someone please show me what I am doing wrong and how I can make the second code run like the first code?

Thanks!




Aucun commentaire:

Enregistrer un commentaire