library(zddr)

Base Case - A Tibble

As a base case, consider a trivial example of a dataframe containing the the cross multiplication of several long lists of variables.

library(magrittr)
start_time <- Sys.time()
tb<-tibble::tibble(
  a = list(c( 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20)),
  #b = list(c(21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40)),
  c = list(c(41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60)),
  d = list(c(61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80)),
  e = list(c(81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100))
) %>%
  tidyr::unnest(a) %>%
  #tidyr::unnest(b) %>%
  tidyr::unnest(c) %>%
  tidyr::unnest(d) %>%
  tidyr::unnest(e) %>%
  dplyr::rowwise() %>%
  dplyr::transmute(X = list(c(a,c,d,e)))
Sys.time() - start_time
#> Time difference of 4.363413 secs
tb
#> # A tibble: 160,000 x 1
#> # Rowwise: 
#>    X        
#>    <list>   
#>  1 <dbl [4]>
#>  2 <dbl [4]>
#>  3 <dbl [4]>
#>  4 <dbl [4]>
#>  5 <dbl [4]>
#>  6 <dbl [4]>
#>  7 <dbl [4]>
#>  8 <dbl [4]>
#>  9 <dbl [4]>
#> 10 <dbl [4]>
#> # … with 159,990 more rows
pryr::object_size(tb) 
#> Registered S3 method overwritten by 'pryr':
#>   method      from
#>   print.bytes Rcpp
#> 24.3 MB

The tibble is an inefficient way to deal with sets, especially if there is a need to remove nonminimal sets. For example, say the above family add a single set of 1, making all existing sets containing 1 nonminimal.

start_time <- Sys.time()
tb %>% dplyr::filter(!1 %in% X) 
#> # A tibble: 152,000 x 1
#> # Rowwise: 
#>    X        
#>    <list>   
#>  1 <dbl [4]>
#>  2 <dbl [4]>
#>  3 <dbl [4]>
#>  4 <dbl [4]>
#>  5 <dbl [4]>
#>  6 <dbl [4]>
#>  7 <dbl [4]>
#>  8 <dbl [4]>
#>  9 <dbl [4]>
#> 10 <dbl [4]>
#> # … with 151,990 more rows
Sys.time() - start_time
#> Time difference of 0.615664 secs

Introduce the ZDD

Perform this same operation using a ZDD from this package and note the space efficiency achieved.

library(zddr)
reset_zdd_store()
#> Resetting zdd_store.
start_time <- Sys.time()
zdd_or(   1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20) *
  #zdd_or(21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40) *
  zdd_or(41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60) *
  zdd_or(61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80) *
  zdd_or(81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100)
#> Memory of ZDD Store:     0.25 MB, item count: 218 ( 1.15 KB/item)
#> Memory of ZDD Functions: 0.077 MB, item count: 358 ( 0.22 KB/item)
#> 5643917c867818eab1444c603aa23cfe : 160000 cutsets, min order: 4 max order: 4 
#> 4-order 
#>  160000 
#> 
#> More than 100 cutsets, not printing all those!
Sys.time() - start_time           
#> Time difference of 0.3063161 secs

The importance of variable ordering

Now if the same number of elements are used, but in random order, note the difference in memory and time. This is a minor illustration of the importance of variable ordering.

reset_zdd_store()
#> Resetting zdd_store.
start_time <- Sys.time()
zdd_or(  85,44,24,41, 5,20,74,71,56,52,92,76,42,89, 65,16, 1,38,10,97) *
  #zdd_or(21, 2,25,36,63,22,61,46,70,62,90,29,91,60, 79,83,51,39,81,58) *
  zdd_or( 7,40,31,26,72,45,34,57,49,67,80,87,86,19,  4,59,17,14,75,53) *
  zdd_or( 6,68,32,78,33,13,73,69,18,11,98,77,82,93, 95,23,99,96,50,88) *
  zdd_or(43,15,94,84,66,64,54, 8,47,28,55,35,48,37,100, 9, 3,30,12,27)
#> Memory of ZDD Store:     1.15 MB, item count: 1016 ( 1.13 KB/item)
#> Memory of ZDD Functions: 0.923 MB, item count: 4773 ( 0.19 KB/item)
#> a6ba8b4945c86ab9205c731e2a68494a : 160000 cutsets, min order: 4 max order: 4 
#> 4-order 
#>  160000 
#> 
#> More than 100 cutsets, not printing all those!
Sys.time() - start_time           
#> Time difference of 4.463271 secs

Minimizing the family of sets

Check out how the size of the ZDD and the calculation time change when a single non-minimal ZDD is unioned with the first one above.

reset_zdd_store()
#> Resetting zdd_store.
start_time <- Sys.time()
zdd_or(   1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20) *
  #zdd_or(21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40) *
  zdd_or(41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60) *
  zdd_or(61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80) *
  zdd_or(81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100) |
  zdd(1) # 19*20^3+1 = 152,001
#> Memory of ZDD Store:     0.343 MB, item count: 296 ( 1.16 KB/item)
#> Memory of ZDD Functions: 0.143 MB, item count: 518 ( 0.28 KB/item)
#> 2d9f36c80c55b599caf00862b5468143 : 152001 cutsets, min order: 1 max order: 4 
#> 1-order 4-order 
#>       1  152000 
#> 
#> More than 100 cutsets, not printing all those!
Sys.time() - start_time            
#> Time difference of 0.4411922 secs

Using the same example, add one additional union of a non-minimal ZDD and watch the change in ZDD complexity (memory) and calculation time.

reset_zdd_store()
#> Resetting zdd_store.
start_time <- Sys.time()
zdd_or(   1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20) *
  #zdd_or(21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40) *
  zdd_or(41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60) *
  zdd_or(61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80) *
  zdd_or(81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100) |
  zdd(1) | zdd(2) # 18*20^3+2 = 144,002
#> Memory of ZDD Store:     0.43 MB, item count: 373 ( 1.15 KB/item)
#> Memory of ZDD Functions: 0.177 MB, item count: 675 ( 0.26 KB/item)
#> e68e61973d287829f8ff1457c1d149cb : 144002 cutsets, min order: 1 max order: 4 
#> 1-order 4-order 
#>       2  144000 
#> 
#> More than 100 cutsets, not printing all those!
Sys.time() - start_time            
#> Time difference of 0.6408699 secs

Minimize based on a higher ranked variable

There is a slight reduction in time and space needs when minimizing a large family of sets against a higher ranked variable. (Compare to the original minimization example above.)

reset_zdd_store()
#> Resetting zdd_store.
start_time <- Sys.time()
(zdd_or(  1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20) *
    #zdd_or(21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40) *
    zdd_or(41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60) *
    zdd_or(61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80) *
    zdd_or(81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100)) | 
  zdd(90) # 19*20^3+1 = 152,001
#> Memory of ZDD Store:     0.267 MB, item count: 229 ( 1.17 KB/item)
#> Memory of ZDD Functions: 0.114 MB, item count: 382 ( 0.3 KB/item)
#> 6e02f648e12b0a1d609061b52aea62cb : 152001 cutsets, min order: 1 max order: 4 
#> 1-order 4-order 
#>       1  152000 
#> 
#> More than 100 cutsets, not printing all those!
Sys.time() - start_time
#> Time difference of 0.3703039 secs

Minimize against a more complicated set

reset_zdd_store()
#> Resetting zdd_store.
start_time <- Sys.time()
(zdd_or(   1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20) *
    #zdd_or(21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40) *
    zdd_or(41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60) *
    zdd_or(61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80) *
    zdd_or(81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100)) |
  zdd_and(1,90) 
#> Memory of ZDD Store:     0.334 MB, item count: 288 ( 1.16 KB/item)
#> Memory of ZDD Functions: 0.142 MB, item count: 514 ( 0.28 KB/item)
#> 426b3c4daedc32ea00c4371b4f683b99 : 159601 cutsets, min order: 2 max order: 4 
#> 2-order 4-order 
#>       1  159600 
#> 
#> More than 100 cutsets, not printing all those!
Sys.time() - start_time
#> Time difference of 0.4271069 secs