Nested analyses

Dewey Dunnington

2023-01-17

Load the packages:

library(tidyverse)
library(tidypaleo)

Creating nested data

Preparing the data:

alta_lake_geochem
#> # A tibble: 192 × 9
#>    location param depth   age value stdev units     n zone  
#>    <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <chr> <int> <chr> 
#>  1 ALGC2    Cu     0.25 2015.  76   NA    ppm       1 Zone 3
#>  2 ALGC2    Cu     0.75 2011. 108.   4.50 ppm       3 Zone 3
#>  3 ALGC2    Cu     1.25 2008. 158   NA    ppm       1 Zone 3
#>  4 ALGC2    Cu     1.75 2003. 169   NA    ppm       1 Zone 3
#>  5 ALGC2    Cu     2.5  1998. 161   NA    ppm       1 Zone 3
#>  6 ALGC2    Cu     3.5  1982. 129   NA    ppm       1 Zone 3
#>  7 ALGC2    Cu     4.5  1966.  88.7  3.86 ppm       3 Zone 2
#>  8 ALGC2    Cu     5.5  1947.  65   NA    ppm       1 Zone 2
#>  9 ALGC2    Cu     6.5  1922.  62.3  9.53 ppm       3 Zone 2
#> 10 ALGC2    Cu     7.5  1896.  48   NA    ppm       1 Zone 2
#> # … with 182 more rows
alta_nested <- nested_data(
  alta_lake_geochem,
  qualifiers = c(age, depth, zone),
  key = param,
  value = value,
  trans = scale
)

alta_nested
#> # A tibble: 1 × 4
#>   discarded_columns discarded_rows   qualifiers        data             
#> * <list>            <list>           <list>            <list>           
#> 1 <tibble [32 × 0]> <tibble [0 × 9]> <tibble [32 × 4]> <tibble [32 × 6]>
alta_nested %>% unnested_data(data)
#> # A tibble: 32 × 6
#>     C[,1] `C/N`[,1] Cu[,1] Ti[,1] d13C[,1] d15N[,1]
#>     <dbl>     <dbl>  <dbl>  <dbl>    <dbl>    <dbl>
#>  1 -1.54      1.26  -0.794  0.807    1.03   -0.670 
#>  2 -1.59      1.36  -0.559  1.33     1.19    0.0499
#>  3 -1.98      0.960 -0.721  0.682    1.10   -0.511 
#>  4 -0.189     1.61  -0.749  0.233    0.836  -2.37  
#>  5  0.993     2.48  -0.694  0.908    1.06   -2.55  
#>  6 -0.157     1.76  -0.712  0.941    1.17   -1.52  
#>  7 -0.642     1.36  -0.667  1.14     1.07   -1.39  
#>  8 -1.07      0.924 -0.559  1.16     0.820  -0.439 
#>  9 -0.722     0.696 -0.830  0.932    0.765  -0.929 
#> 10 -0.631     0.309 -0.504  0.882    0.409  -0.166 
#> # … with 22 more rows
alta_nested %>% unnested_data(qualifiers, data)
#> # A tibble: 32 × 10
#>      age depth zone   row_number  C[,1] `C/N`[,1] Cu[,1] Ti[,1] d13C[,1] d15N[…¹
#>    <dbl> <dbl> <chr>       <int>  <dbl>     <dbl>  <dbl>  <dbl>    <dbl>   <dbl>
#>  1 1550   29.5 Zone 1          1 -1.54      1.26  -0.794  0.807    1.03  -0.670 
#>  2 1566.  28.5 Zone 1          2 -1.59      1.36  -0.559  1.33     1.19   0.0499
#>  3 1581.  27.5 Zone 1          3 -1.98      0.960 -0.721  0.682    1.10  -0.511 
#>  4 1597.  26.5 Zone 1          4 -0.189     1.61  -0.749  0.233    0.836 -2.37  
#>  5 1613.  25.5 Zone 1          5  0.993     2.48  -0.694  0.908    1.06  -2.55  
#>  6 1629.  24.5 Zone 1          6 -0.157     1.76  -0.712  0.941    1.17  -1.52  
#>  7 1644.  23.5 Zone 1          7 -0.642     1.36  -0.667  1.14     1.07  -1.39  
#>  8 1660.  22.5 Zone 1          8 -1.07      0.924 -0.559  1.16     0.820 -0.439 
#>  9 1676.  21.5 Zone 1          9 -0.722     0.696 -0.830  0.932    0.765 -0.929 
#> 10 1692.  20.5 Zone 1         10 -0.631     0.309 -0.504  0.882    0.409 -0.166 
#> # … with 22 more rows, and abbreviated variable name ¹​d15N[,1]

Principal components analysis

pca <- alta_nested %>% nested_prcomp()
pca
#> # A tibble: 1 × 8
#>   discarded_col…¹ discar…² qualif…³ data     model    variance loadings scores  
#> * <list>          <list>   <list>   <list>   <list>   <list>   <list>   <list>  
#> 1 <tibble>        <tibble> <tibble> <tibble> <prcomp> <tibble> <tibble> <tibble>
#> # … with abbreviated variable names ¹​discarded_columns, ²​discarded_rows,
#> #   ³​qualifiers
plot(pca)

pca %>% unnested_data(qualifiers, scores)
#> # A tibble: 32 × 10
#>      age depth zone   row_number   PC1     PC2     PC3     PC4     PC5     PC6
#>    <dbl> <dbl> <chr>       <int> <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
#>  1 1550   29.5 Zone 1          1 -2.48 -0.273   0.409   0.518  -0.316   0.0538
#>  2 1566.  28.5 Zone 1          2 -2.48 -0.675   0.902   0.561   0.364   0.190 
#>  3 1581.  27.5 Zone 1          3 -2.41 -0.721   0.576   0.465  -0.619   0.184 
#>  4 1597.  26.5 Zone 1          4 -2.39  1.73   -0.527   0.197  -0.742  -0.0656
#>  5 1613.  25.5 Zone 1          5 -2.73  2.82   -0.666   0.212   0.432  -0.0715
#>  6 1629.  24.5 Zone 1          6 -2.56  1.22   -0.155   0.235   0.208   0.105 
#>  7 1644.  23.5 Zone 1          7 -2.57  0.708   0.123  -0.0210  0.0114 -0.0286
#>  8 1660.  22.5 Zone 1          8 -2.04 -0.243   0.490   0.101   0.157  -0.0834
#>  9 1676.  21.5 Zone 1          9 -1.98  0.0702 -0.0657 -0.147  -0.0813 -0.186 
#> 10 1692.  20.5 Zone 1         10 -1.20 -0.376   0.202  -0.132   0.202  -0.220 
#> # … with 22 more rows
pca %>% unnested_data(variance)
#> # A tibble: 6 × 6
#>   component component_text standard_deviation variance variance_propor…¹ varia…²
#>       <int> <chr>                       <dbl>    <dbl>             <dbl>   <dbl>
#> 1         1 PC1                         2.15    4.61             0.768     0.768
#> 2         2 PC2                         0.884   0.781            0.130     0.899
#> 3         3 PC3                         0.603   0.364            0.0607    0.959
#> 4         4 PC4                         0.381   0.145            0.0242    0.984
#> 5         5 PC5                         0.276   0.0761           0.0127    0.996
#> 6         6 PC6                         0.151   0.0228           0.00380   1    
#> # … with abbreviated variable names ¹​variance_proportion,
#> #   ²​variance_proportion_cumulative
pca %>% unnested_data(loadings)
#> # A tibble: 6 × 7
#>   variable    PC1     PC2    PC3    PC4     PC5     PC6
#>   <chr>     <dbl>   <dbl>  <dbl>  <dbl>   <dbl>   <dbl>
#> 1 C         0.380  0.540  -0.460 -0.162  0.567   0.0718
#> 2 C/N      -0.401  0.451   0.207  0.735  0.223  -0.0408
#> 3 Cu        0.387  0.340   0.760 -0.173 -0.0609  0.352 
#> 4 Ti       -0.439 -0.0783  0.356 -0.483  0.565  -0.350 
#> 5 d13C     -0.458 -0.0890 -0.144 -0.145  0.126   0.851 
#> 6 d15N      0.377 -0.613   0.144  0.386  0.539   0.149

Constrained hierarchical clustering

keji_nested <- keji_lakes_plottable %>%
  group_by(location) %>%
  nested_data(qualifiers = depth, key = taxon, value = rel_abund)

keji_nested %>% unnested_data(qualifiers, data)
#> # A tibble: 37 × 9
#>    location        depth row_num…¹ Aster…² Aulac…³ Aulac…⁴ Cyclo…⁵ Tabel…⁶ Other
#>    <chr>           <dbl>     <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <dbl>
#>  1 Beaverskin Lake 0.125         1   0        3.02    1.72    3.45   0      91.8
#>  2 Beaverskin Lake 0.375         2   0        3.25    2.03    5.69   1.63   87.4
#>  3 Beaverskin Lake 0.825         3   1.32     0       2.32    4.30   1.32   90.7
#>  4 Beaverskin Lake 2.12          4   0.333    0       2.67    3      1.67   92.3
#>  5 Beaverskin Lake 3.12          5   0        0       5.32    6.98   1.33   86.4
#>  6 Beaverskin Lake 4.12          6   0        0       3.54   13.2    0.643  82.6
#>  7 Beaverskin Lake 5.38          7   0.987    0      10.2    12.8    2.63   73.4
#>  8 Beaverskin Lake 6.38          8   0.993    0       8.94   17.5    3.97   68.5
#>  9 Beaverskin Lake 7.62          9   1.63     0       8.82   20.3    2.29   67.0
#> 10 Beaverskin Lake 9.12         10   0.328    0      10.8    23.9    2.95   62.0
#> # … with 27 more rows, and abbreviated variable names ¹​row_number,
#> #   ²​`Asterionella ralfsii var. americana (large)`, ³​`Aulacoseira distans`,
#> #   ⁴​`Aulacoseira lirata`, ⁵​`Cyclotella stelligera`,
#> #   ⁶​`Tabellaria flocculosa (strain III)`
coniss <- keji_nested %>% 
  nested_chclust_coniss()

plot(coniss, main = location)

plot(coniss, main = location, xvar = qualifiers$depth, labels = "")

coniss %>% select(location, zone_info) %>% unnest(zone_info)
#> # A tibble: 4 × 12
#>   location       hclus…¹ min_d…² max_d…³ first…⁴ last_…⁵ min_r…⁶ max_r…⁷ first…⁸
#>   <chr>            <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
#> 1 Beaverskin La…       1   0.125    4.12   0.125    4.12       1       6       1
#> 2 Beaverskin La…       2   5.38    23.4    5.38    23.4        7      17       7
#> 3 Peskawa Lake         1   0.125    5.12   0.125    5.12       1       5       1
#> 4 Peskawa Lake         2   6.38    35.1    6.38    35.1        6      20       6
#> # … with 3 more variables: last_row_number <dbl>, boundary_depth <dbl>,
#> #   boundary_row_number <dbl>, and abbreviated variable names ¹​hclust_zone,
#> #   ²​min_depth, ³​max_depth, ⁴​first_depth, ⁵​last_depth, ⁶​min_row_number,
#> #   ⁷​max_row_number, ⁸​first_row_number
keji_nested %>%
  nested_chclust_coniss(n_groups = c(3, 2)) %>%
  select(location, zone_info) %>% 
  unnested_data(zone_info)
#> # A tibble: 5 × 12
#>   location       hclus…¹ min_d…² max_d…³ first…⁴ last_…⁵ min_r…⁶ max_r…⁷ first…⁸
#>   <chr>            <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
#> 1 Beaverskin La…       1   0.125    4.12   0.125    4.12       1       6       1
#> 2 Beaverskin La…       2   5.38    13.6    5.38    13.6        7      13       7
#> 3 Beaverskin La…       3  15.1     23.4   15.1     23.4       14      17      14
#> 4 Peskawa Lake         1   0.125    5.12   0.125    5.12       1       5       1
#> 5 Peskawa Lake         2   6.38    35.1    6.38    35.1        6      20       6
#> # … with 3 more variables: last_row_number <dbl>, boundary_depth <dbl>,
#> #   boundary_row_number <dbl>, and abbreviated variable names ¹​hclust_zone,
#> #   ²​min_depth, ³​max_depth, ⁴​first_depth, ⁵​last_depth, ⁶​min_row_number,
#> #   ⁷​max_row_number, ⁸​first_row_number

Unconstrained hierarchical clustering

halifax_nested <- halifax_lakes_plottable %>%
  nested_data(c(location, sample_type), taxon, rel_abund, fill = 0)

halifax_nested %>% unnested_data(qualifiers, data)
#> # A tibble: 20 × 9
#>    location        sampl…¹ row_n…² Aulac…³ Eunot…⁴ Fragi…⁵ Tabel…⁶ Tabel…⁷ Other
#>    <chr>           <chr>     <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <dbl>
#>  1 Anderson Lake   bottom        1   4.65    2.42    1.16    4.11    3.76   83.9
#>  2 Anderson Lake   top           2   1.87    0       0.330   5.71    5.71   86.4
#>  3 Bayers          bottom        3  11.6     5.71    4.50    4.84    0      73.4
#>  4 Bayers          top           4   0.993   6.81    3.40    3.40    0      85.4
#>  5 Bell Lake       bottom        5   0.476   2.62    0.833   2.14    6.90   87.0
#>  6 Bell Lake       top           6   9.22    0.432   2.59    6.20    1.01   80.5
#>  7 Cranberry Lake  bottom        7   0       9.17   11.6     3.06    9.39   66.8
#>  8 Cranberry Lake  top           8   0       7.72    8.94    0.203   8.54   74.6
#>  9 Frasers Lake    bottom        9   6.42    0.714   0.624   1.87    2.59   87.8
#> 10 Frasers Lake    top          10   4.85    0       0       8.58   10.6    76.0
#> 11 Kinsac lake     bottom       11  11.3     8.33    2.98   12.7     0.992  63.7
#> 12 Kinsac lake     top          12   0       3.85    2.75    3.30    0      90.1
#> 13 Little Albro L… bottom       13   2.34    2.34    1.91    1.70    6.69   85.0
#> 14 Little Albro L… top          14   5.78    2.61    1.90    0       5.78   83.9
#> 15 Little Springf… bottom       15  12.1     2.64    6.23   10       0      69.1
#> 16 Little Springf… top          16   0      19.8    14.1    11.4     0      54.6
#> 17 Maynard Lake    bottom       17   9.75    4       1.5    11.6     7.88   65.2
#> 18 Maynard Lake    top          18   2.98    1.23    2.26    7.51   10.1    75.9
#> 19 Miller Lake     bottom       19   1.79    2.19    2.19    0.299  15.6    77.9
#> 20 Miller Lake     top          20   0.816   4.35    2.90    1.27    1.18   89.5
#> # … with abbreviated variable names ¹​sample_type, ²​row_number,
#> #   ³​`Aulacoseira distans`, ⁴​`Eunotia exigua`, ⁵​`Fragilariforma exigua`,
#> #   ⁶​`Tabellaria fenestrata`, ⁷​`Tabellaria flocculosa (strain IV)`
hclust <- halifax_nested %>%
  nested_hclust(method = "average")

plot(
  hclust, 
  labels = sprintf(
    "%s (%s)",
    qualifiers$location,
    qualifiers$sample_type
  )
)

Nested analysis of other functions

alta_nested %>%
  nested_analysis(vegan::rda, data) %>%
  plot()

biplot(pca)