Generate test sets for venndir

make_venn_test(
  n_items = 1000000,
  n_sets = 4,
  do_signed = FALSE,
  concordance = 0.5,
  min_size = ceiling(n_items/50),
  max_size = ceiling(n_items/2),
  items = NULL,
  sizes = NULL,
  seed = 123,
  item_prefix = "item_",
  ...
)

Arguments

n_items

integer total number of items available to all sets, also known as the universe size.

n_sets

integer number of sets that contain items.

do_signed

logical indicating whether to return signed sets, which indicate directionality with -1 or 1 values, named by the items.

concordance

numeric between -1 and 1, used when do_signed=TRUE. This value imposes an approximate amount of concordance between random pairs of sets, using the concordance equation: concordance = (agree - disagree) / (agree + disagree), where (agree + disagree) = n. Solving this equation for agree gives the approximate number of items that agree: agree = ceiling((concordance * n + n) / 2).

min_size

integer minimum number of items that may be contained in each set, the lower end of the range of set sizes.

max_size

integer maximum number of items that may be contained in each set, the upper end of the range of set sizes.

items

vector or NULL that contains the universe of items. When items is defined, n_items is ignored.

sizes

vector of integer values, or NULL, indicating the size of each set. When sizes is defined, min_size and max_size are ignored, and names(sizes) are used as names for each set.

seed

numeric or NULL used with set.seed() for data reproducibility. When seed=NULL then set.seed() is not called.

item_prefix

character string used as prefix for item names, default "item_".

...

additional arguments are ignored.

Value

list of sets, where each set is a vector of items. When do_signed=TRUE, each set is instead a vector whose names are the items and whose values are signed values from c(-1, 1).

Details

This function generates data to use as test input to Venn diagram functions. It can generate sets of items, or signed sets (integer values -1, 1) named by item.

This function defines a range of set sizes, using min_size and max_size, with a roughly square-root sequence of sizes between these two extremes.

Note that the universe size represents the total available items, but not necessarily the total number of items represented by the sets. For example, if n_items=1000000, max_size=500 and n_sets=3 then the maximum number of items actually represented is 1500.

The universe can be defined using optional argument items, which takes priority over n_items.

The specific size of each set can be defined with optional argument sizes, which takes priority over min_size and max_size.

Examples

## basic setlist without signed direction
setlist <- make_venn_test(n_items=100,
   n_sets=3,
   min_size=5,
   max_size=25)
set_im <- list2im_opt(setlist);
table(jamba::pasteByRow(as.matrix(set_im)*1))
#> 
#> 0_0_1 0_1_0 0_1_1 1_0_0 1_0_1 1_1_0 1_1_1 
#>     9    12     5    10     3     5     1 

## basic setlist with signed direction
setlist <- make_venn_test(n_items=100,
   n_sets=3,
   do_signed=TRUE)
jamba::sdim(setlist);
#>       rows   class
#> set_A   32 numeric
#> set_B   16 numeric
#> set_C   15 numeric

## some example overlap summaries
sv1 <- signed_overlaps(setlist=setlist, "overlap")
sv1
#>                                      sets overlap num_sets count set_A set_B
#> set_A|1 0 0                         set_A   1 0 0        1    16     1     0
#> set_B|0 1 0                         set_B   0 1 0        1     7     0     1
#> set_C|0 0 1                         set_C   0 0 1        1     7     0     0
#> set_A&set_B|1 1 0             set_A&set_B   1 1 0        2     8     1     1
#> set_A&set_C|1 0 1             set_A&set_C   1 0 1        2     7     1     0
#> set_B&set_C|0 1 1             set_B&set_C   0 1 1        2     0     0     1
#> set_A&set_B&set_C|1 1 1 set_A&set_B&set_C   1 1 1        3     1     1     1
#>                         set_C overlap_label
#> set_A|1 0 0                 0             1
#> set_B|0 1 0                 0             1
#> set_C|0 0 1                 1             1
#> set_A&set_B|1 1 0           0           1 1
#> set_A&set_C|1 0 1           1           1 1
#> set_B&set_C|0 1 1           1           1 1
#> set_A&set_B&set_C|1 1 1     1         1 1 1

## Familiar named overlap counts
jamba::nameVector(sv1[,c("count","sets")])
#>             set_A             set_B             set_C       set_A&set_B 
#>                16                 7                 7                 8 
#>       set_A&set_C       set_B&set_C set_A&set_B&set_C 
#>                 7                 0                 1 

## directional count table for each combination
sv2 <- signed_overlaps(setlist=setlist, "each")
sv2
#>                                      sets    each overlap num_sets count set_A
#> set_A|-1 0 0                        set_A  -1 0 0   1 0 0        1     7     1
#> set_A|1 0 0                         set_A   1 0 0   1 0 0        1     9     1
#> set_B|0 -1 0                        set_B  0 -1 0   0 1 0        1     3     0
#> set_B|0 1 0                         set_B   0 1 0   0 1 0        1     4     0
#> set_C|0 0 -1                        set_C  0 0 -1   0 0 1        1     2     0
#> set_C|0 0 1                         set_C   0 0 1   0 0 1        1     5     0
#> set_A&set_B|-1 -1 0           set_A&set_B -1 -1 0   1 1 0        2     3     1
#> set_A&set_B|1 -1 0            set_A&set_B  1 -1 0   1 1 0        2     2     1
#> set_A&set_B|1 1 0             set_A&set_B   1 1 0   1 1 0        2     3     1
#> set_A&set_C|-1 0 -1           set_A&set_C -1 0 -1   1 0 1        2     3     1
#> set_A&set_C|-1 0 1            set_A&set_C  -1 0 1   1 0 1        2     1     1
#> set_A&set_C|1 0 -1            set_A&set_C  1 0 -1   1 0 1        2     1     1
#> set_A&set_C|1 0 1             set_A&set_C   1 0 1   1 0 1        2     2     1
#> set_B&set_C|0 1 1             set_B&set_C   0 1 1   0 1 1        2     0     0
#> set_A&set_B&set_C|1 1 1 set_A&set_B&set_C   1 1 1   1 1 1        3     1     1
#>                         set_B set_C overlap_label
#> set_A|-1 0 0                0     0            -1
#> set_A|1 0 0                 0     0             1
#> set_B|0 -1 0                1     0            -1
#> set_B|0 1 0                 1     0             1
#> set_C|0 0 -1                0     1            -1
#> set_C|0 0 1                 0     1             1
#> set_A&set_B|-1 -1 0         1     0         -1 -1
#> set_A&set_B|1 -1 0          1     0          1 -1
#> set_A&set_B|1 1 0           1     0           1 1
#> set_A&set_C|-1 0 -1         0     1         -1 -1
#> set_A&set_C|-1 0 1          0     1          -1 1
#> set_A&set_C|1 0 -1          0     1          1 -1
#> set_A&set_C|1 0 1           0     1           1 1
#> set_B&set_C|0 1 1           1     1           1 1
#> set_A&set_B&set_C|1 1 1     1     1         1 1 1

## directional count table for agreement or mixed
sv3 <- signed_overlaps(setlist=setlist, "agreement")
sv3
#>                                          sets agreement overlap num_sets count
#> set_A|agreement                         set_A agreement   1 0 0        1    16
#> set_B|agreement                         set_B agreement   0 1 0        1     7
#> set_C|agreement                         set_C agreement   0 0 1        1     7
#> set_A&set_B|agreement             set_A&set_B agreement   1 1 0        2     6
#> set_A&set_B|mixed                 set_A&set_B     mixed   1 1 0        2     2
#> set_A&set_C|agreement             set_A&set_C agreement   1 0 1        2     5
#> set_A&set_C|mixed                 set_A&set_C     mixed   1 0 1        2     2
#> set_B&set_C|0 1 1                 set_B&set_C agreement   0 1 1        2     0
#> set_A&set_B&set_C|agreement set_A&set_B&set_C agreement   1 1 1        3     1
#>                             set_A set_B set_C overlap_label
#> set_A|agreement                 1     0     0     agreement
#> set_B|agreement                 0     1     0     agreement
#> set_C|agreement                 0     0     1     agreement
#> set_A&set_B|agreement           1     1     0     agreement
#> set_A&set_B|mixed               1     1     0         mixed
#> set_A&set_C|agreement           1     0     1     agreement
#> set_A&set_C|mixed               1     0     1         mixed
#> set_B&set_C|0 1 1               0     1     1     agreement
#> set_A&set_B&set_C|agreement     1     1     1     agreement

## signed incidence matrix
imv <- list2im_value(setlist)
dim(imv)
#> [1] 46  3
head(imv)
#>          set_A set_B set_C
#> item_067    -1     0     0
#> item_042     1     0     0
#> item_050     1     0    -1
#> item_043    -1     0     0
#> item_014    -1     0     0
#> item_025     1    -1     0

## text venn diagram
textvenn(setlist, overlap_type="overlap")
#>                                 set_A&set_B                                     
#>                                      8                                          
#>    set_A                                                              set_B     
#>     16                                                                  7       
#>                                                                                 
#>                              set_A&set_B&set_C                                  
#>                                      1                                          
#>              set_A&set_C                              set_B&set_C               
#>                   7                                        0                    
#>                                                                                 
#>                                                                                 
#>                                    set_C                                        
#>                                      7                                          

## text venn diagram with signed direction
textvenn(setlist, overlap_type="each")
#>                                        set_A&set_B     ↑↑: 3                                       
#>                                             8          ↑↓: 2                                       
#>    set_A  ↑: 9                                         ↓↓: 3                          set_B  ↑: 4  
#>     16    ↓: 7                                                                          7    ↓: 3  
#>                                                                                                    
#>                                     set_A&set_B&set_C  ↑↑↑: 1                                      
#>                                             1                                                      
#>                 set_A&set_C  ↑↑: 2                                set_B&set_C  ↑↑: 0               
#>                      7       ↑↓: 1                                     0                           
#>                              ↓↑: 1                                                                 
#>                              ↓↓: 3                                                                 
#>                                           set_C         ↑: 5                                       
#>                                             7           ↓: 2