lundi 6 juillet 2020

Piece of code influences random numbers in foreach output

I run a simulation using foreach and doParallel and struggling with random numbers (named random in the code).

In a nutshell: I simulate a football league, randomly generating winners of all the matches and corresponding results. In dt_base no match was played, in dt_ex1 and dt_ex2 results of 4 matches are known already. All unknown results should be simulated.

In the League Simulation Code at the bottom of this post I set 1000 simulations, split into 100 chunks (the forloop is used to send data to PostgreSQL and reduce RAM usage in the full code I use). I expect all the random numbers to be different (don't even insist on reproducible results).

1. When running the code as given, one should achieve the goal of all different random numbers.

> # ====== Distinct Random Numbers ======
> length(unique(out$random))                              # expectation: 22000
[1] 22000
> length(unique(out$random[out$part == "base"]))          # expectation: 10000
[1] 10000
> length(unique(out$random[out$part == "dt_ex1"]))        # expectation: 6000
[1] 6000
> length(unique(out$random[out$part == "dt_ex2"]))        # expectation: 6000
[1] 6000

2. Now please uncomment the pieces of code which assigns the final score *[tmp_sim] = 3 (should be lines 60,61,67,68 with !!! on them) and run it again.

> # ====== Distinct Random Numbers ======
> length(unique(out$random))                              # expectation: 22000
[1] 10360
> length(unique(out$random[out$part == "base"]))          # expectation: 10000
[1] 10000
> length(unique(out$random[out$part == "dt_ex1"]))        # expectation: 6000
[1] 180
> length(unique(out$random[out$part == "dt_ex2"]))        # expectation: 6000
[1] 180

That is when it gets messed up and it doesn't make sense to me. random inside iter is always the same for dt_ex1 and dt_ex2 when adding couple of numbers into these dataframes.

Are you experiencing the same effect? Any idea what is going on please?

I tried R versions 3.5.3 and 3.6.3. Also tried doRNG package. Always the same problem.

League Simulation Code

# League Simulation
rm(list = ls())
set.seed(666)
cat("\014")
library(sqldf)
library(plyr)
library(dplyr)

# ====== User Functions ======
comb4 = function(x, ...) { #function for combining foreach output
  Map(rbind, x, ...)
}

# ====== Data Preparation ======
dt_base = data.frame(id = 1:10,
                  part = rep("base",10),
                  random = NA)

dt_ex1 = data.frame(id = 1:10,
                         part = rep("dt_ex1",10),
                         HG = c(1,3,6,NA,NA,2,NA,NA,NA,NA),  # Home Goals
                         AG = c(1,3,6,NA,NA,2,NA,NA,NA,NA),  # Away Goals
                         random = NA)

dt_ex2 = data.frame(id = 1:10,
                            part = rep("dt_ex2",10),
                         HG = c(1,3,6,NA,NA,2,NA,NA,NA,NA),  # Home Goals
                         AG = c(1,3,6,NA,NA,2,NA,NA,NA,NA),  # Away Goals
                         random = NA)

# ====== Set Parallel Computing ======
library(foreach)
library(doParallel)

cl = makeCluster(3, outfile = "")
registerDoParallel(cl)

# ====== SIMULATION ======
nsim = 1000                # number of simulations
iterChunk = 100            # split nsim into this many chunks
out = data.frame()    # prepare output DF
for(iter in 1:ceiling(nsim/iterChunk)){
  strt = Sys.time()
  
  out_iter = 
    foreach(i = 1:iterChunk, .combine = comb4, .multicombine = TRUE, .maxcombine = 100000, .inorder = FALSE, .verbose = FALSE,
            .packages = c("plyr", "dplyr", "sqldf")) %dopar% {
              
              ## PART 1
              # simulation number
              id_sim = iterChunk * (iter - 1) + i
              
              # First random numbers set
              dt_base[,"random"] = runif(nrow(dt_base))
              
              
              ## PART 2
              tmp_sim = is.na(dt_ex1$HG) # no results yet
              dt_ex1$random[tmp_sim] = runif(sum(tmp_sim))
              # dt_ex1$HG[tmp_sim] = 3   # !!!
              # dt_ex1$AG[tmp_sim] = 3   # !!!
              
              
              ## PART 3
              tmp_sim = is.na(dt_ex2$HG) # no results yet
              dt_ex2$random[tmp_sim] = runif(sum(tmp_sim))
              # dt_ex2$HG[tmp_sim] = 3   # !!!
              # dt_ex2$AG[tmp_sim] = 3   # !!!
              
              
              # ---- Save Results
              zapasy = rbind.data.frame(dt_base[,c("id","part","random")],
                                        dt_ex1[,c("id","part","random")]
                                        ,dt_ex2[,c("id","part","random")]
              )
              zapasy$id_sim = id_sim
              zapasy$iter = iter
              zapasy$i = i
              
              out_i = list(zapasy = zapasy)
              
              print(Sys.time())
              return(out_i)
            }#i;sim_forcycle
  
  out = rbind.data.frame(out,subset(out_iter$zapasy, !is.na(random)))
  
  fnsh = Sys.time()
  cat(" [",iter,"] ",fnsh - strt, sep = "")
  
}#iter


# ====== Distinct Random Numbers ======
length(unique(out$random))                              # expectation: 22000
length(unique(out$random[out$part == "base"]))          # expectation: 10000
length(unique(out$random[out$part == "dt_ex1"]))        # expectation: 6000
length(unique(out$random[out$part == "dt_ex2"]))        # expectation: 6000


# ====== Stop Parallel Computing ======
stopCluster(cl)




Aucun commentaire:

Enregistrer un commentaire