3  Data importation & rarefaction

# Pkg install -----
packages <- c(
  "ggpubr", "rstatix", "broom", "ggplot2", "multcompView", "ggrepel", "ggpattern",
  "readxl", "tidyverse", "dplyr", "knitr", "kableExtra", "FactoMineR", "vegan",
  "phyloseq", "sjPlot", "pscl", "patchwork", "ggtext", "RColorBrewer",
  "zCompositions", "lme4", "emmeans", "metacoder", "foreach", "doParallel",
  "forcats", "scales", "colorspace", "gplots", "statmod", "ComplexUpset",
  "paletteer"
)

install.packages(setdiff(packages, installed.packages()[,"Package"]))

if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools")

# qiime2R et metacoder depuis GitHub
devtools::install_github("jbisanz/qiime2R")
devtools::install_github("grunwaldlab/metacoder")

lapply(packages, library, character.only = TRUE)
[[1]]
[1] "ggpubr"    "ggplot2"   "stats"     "graphics"  "grDevices" "utils"    
[7] "datasets"  "methods"   "base"     

[[2]]
 [1] "rstatix"   "ggpubr"    "ggplot2"   "stats"     "graphics"  "grDevices"
 [7] "utils"     "datasets"  "methods"   "base"     

[[3]]
 [1] "broom"     "rstatix"   "ggpubr"    "ggplot2"   "stats"     "graphics" 
 [7] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[4]]
 [1] "broom"     "rstatix"   "ggpubr"    "ggplot2"   "stats"     "graphics" 
 [7] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[5]]
 [1] "multcompView" "broom"        "rstatix"      "ggpubr"       "ggplot2"     
 [6] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
[11] "methods"      "base"        

[[6]]
 [1] "ggrepel"      "multcompView" "broom"        "rstatix"      "ggpubr"      
 [6] "ggplot2"      "stats"        "graphics"     "grDevices"    "utils"       
[11] "datasets"     "methods"      "base"        

[[7]]
 [1] "ggpattern"    "ggrepel"      "multcompView" "broom"        "rstatix"     
 [6] "ggpubr"       "ggplot2"      "stats"        "graphics"     "grDevices"   
[11] "utils"        "datasets"     "methods"      "base"        

[[8]]
 [1] "readxl"       "ggpattern"    "ggrepel"      "multcompView" "broom"       
 [6] "rstatix"      "ggpubr"       "ggplot2"      "stats"        "graphics"    
[11] "grDevices"    "utils"        "datasets"     "methods"      "base"        

[[9]]
 [1] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
 [6] "readr"        "tidyr"        "tibble"       "tidyverse"    "readxl"      
[11] "ggpattern"    "ggrepel"      "multcompView" "broom"        "rstatix"     
[16] "ggpubr"       "ggplot2"      "stats"        "graphics"     "grDevices"   
[21] "utils"        "datasets"     "methods"      "base"        

[[10]]
 [1] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
 [6] "readr"        "tidyr"        "tibble"       "tidyverse"    "readxl"      
[11] "ggpattern"    "ggrepel"      "multcompView" "broom"        "rstatix"     
[16] "ggpubr"       "ggplot2"      "stats"        "graphics"     "grDevices"   
[21] "utils"        "datasets"     "methods"      "base"        

[[11]]
 [1] "knitr"        "lubridate"    "forcats"      "stringr"      "dplyr"       
 [6] "purrr"        "readr"        "tidyr"        "tibble"       "tidyverse"   
[11] "readxl"       "ggpattern"    "ggrepel"      "multcompView" "broom"       
[16] "rstatix"      "ggpubr"       "ggplot2"      "stats"        "graphics"    
[21] "grDevices"    "utils"        "datasets"     "methods"      "base"        

[[12]]
 [1] "kableExtra"   "knitr"        "lubridate"    "forcats"      "stringr"     
 [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
[11] "tidyverse"    "readxl"       "ggpattern"    "ggrepel"      "multcompView"
[16] "broom"        "rstatix"      "ggpubr"       "ggplot2"      "stats"       
[21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
[26] "base"        

[[13]]
 [1] "FactoMineR"   "kableExtra"   "knitr"        "lubridate"    "forcats"     
 [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
[11] "tibble"       "tidyverse"    "readxl"       "ggpattern"    "ggrepel"     
[16] "multcompView" "broom"        "rstatix"      "ggpubr"       "ggplot2"     
[21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
[26] "methods"      "base"        

[[14]]
 [1] "vegan"        "permute"      "FactoMineR"   "kableExtra"   "knitr"       
 [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
[11] "readr"        "tidyr"        "tibble"       "tidyverse"    "readxl"      
[16] "ggpattern"    "ggrepel"      "multcompView" "broom"        "rstatix"     
[21] "ggpubr"       "ggplot2"      "stats"        "graphics"     "grDevices"   
[26] "utils"        "datasets"     "methods"      "base"        

[[15]]
 [1] "phyloseq"     "vegan"        "permute"      "FactoMineR"   "kableExtra"  
 [6] "knitr"        "lubridate"    "forcats"      "stringr"      "dplyr"       
[11] "purrr"        "readr"        "tidyr"        "tibble"       "tidyverse"   
[16] "readxl"       "ggpattern"    "ggrepel"      "multcompView" "broom"       
[21] "rstatix"      "ggpubr"       "ggplot2"      "stats"        "graphics"    
[26] "grDevices"    "utils"        "datasets"     "methods"      "base"        

[[16]]
 [1] "sjPlot"       "phyloseq"     "vegan"        "permute"      "FactoMineR"  
 [6] "kableExtra"   "knitr"        "lubridate"    "forcats"      "stringr"     
[11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
[16] "tidyverse"    "readxl"       "ggpattern"    "ggrepel"      "multcompView"
[21] "broom"        "rstatix"      "ggpubr"       "ggplot2"      "stats"       
[26] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
[31] "base"        

[[17]]
 [1] "pscl"         "sjPlot"       "phyloseq"     "vegan"        "permute"     
 [6] "FactoMineR"   "kableExtra"   "knitr"        "lubridate"    "forcats"     
[11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
[16] "tibble"       "tidyverse"    "readxl"       "ggpattern"    "ggrepel"     
[21] "multcompView" "broom"        "rstatix"      "ggpubr"       "ggplot2"     
[26] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
[31] "methods"      "base"        

[[18]]
 [1] "patchwork"    "pscl"         "sjPlot"       "phyloseq"     "vegan"       
 [6] "permute"      "FactoMineR"   "kableExtra"   "knitr"        "lubridate"   
[11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
[16] "tidyr"        "tibble"       "tidyverse"    "readxl"       "ggpattern"   
[21] "ggrepel"      "multcompView" "broom"        "rstatix"      "ggpubr"      
[26] "ggplot2"      "stats"        "graphics"     "grDevices"    "utils"       
[31] "datasets"     "methods"      "base"        

[[19]]
 [1] "ggtext"       "patchwork"    "pscl"         "sjPlot"       "phyloseq"    
 [6] "vegan"        "permute"      "FactoMineR"   "kableExtra"   "knitr"       
[11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
[16] "readr"        "tidyr"        "tibble"       "tidyverse"    "readxl"      
[21] "ggpattern"    "ggrepel"      "multcompView" "broom"        "rstatix"     
[26] "ggpubr"       "ggplot2"      "stats"        "graphics"     "grDevices"   
[31] "utils"        "datasets"     "methods"      "base"        

[[20]]
 [1] "RColorBrewer" "ggtext"       "patchwork"    "pscl"         "sjPlot"      
 [6] "phyloseq"     "vegan"        "permute"      "FactoMineR"   "kableExtra"  
[11] "knitr"        "lubridate"    "forcats"      "stringr"      "dplyr"       
[16] "purrr"        "readr"        "tidyr"        "tibble"       "tidyverse"   
[21] "readxl"       "ggpattern"    "ggrepel"      "multcompView" "broom"       
[26] "rstatix"      "ggpubr"       "ggplot2"      "stats"        "graphics"    
[31] "grDevices"    "utils"        "datasets"     "methods"      "base"        

[[21]]
 [1] "zCompositions" "survival"      "truncnorm"     "MASS"         
 [5] "RColorBrewer"  "ggtext"        "patchwork"     "pscl"         
 [9] "sjPlot"        "phyloseq"      "vegan"         "permute"      
[13] "FactoMineR"    "kableExtra"    "knitr"         "lubridate"    
[17] "forcats"       "stringr"       "dplyr"         "purrr"        
[21] "readr"         "tidyr"         "tibble"        "tidyverse"    
[25] "readxl"        "ggpattern"     "ggrepel"       "multcompView" 
[29] "broom"         "rstatix"       "ggpubr"        "ggplot2"      
[33] "stats"         "graphics"      "grDevices"     "utils"        
[37] "datasets"      "methods"       "base"         

[[22]]
 [1] "lme4"          "Matrix"        "zCompositions" "survival"     
 [5] "truncnorm"     "MASS"          "RColorBrewer"  "ggtext"       
 [9] "patchwork"     "pscl"          "sjPlot"        "phyloseq"     
[13] "vegan"         "permute"       "FactoMineR"    "kableExtra"   
[17] "knitr"         "lubridate"     "forcats"       "stringr"      
[21] "dplyr"         "purrr"         "readr"         "tidyr"        
[25] "tibble"        "tidyverse"     "readxl"        "ggpattern"    
[29] "ggrepel"       "multcompView"  "broom"         "rstatix"      
[33] "ggpubr"        "ggplot2"       "stats"         "graphics"     
[37] "grDevices"     "utils"         "datasets"      "methods"      
[41] "base"         

[[23]]
 [1] "emmeans"       "lme4"          "Matrix"        "zCompositions"
 [5] "survival"      "truncnorm"     "MASS"          "RColorBrewer" 
 [9] "ggtext"        "patchwork"     "pscl"          "sjPlot"       
[13] "phyloseq"      "vegan"         "permute"       "FactoMineR"   
[17] "kableExtra"    "knitr"         "lubridate"     "forcats"      
[21] "stringr"       "dplyr"         "purrr"         "readr"        
[25] "tidyr"         "tibble"        "tidyverse"     "readxl"       
[29] "ggpattern"     "ggrepel"       "multcompView"  "broom"        
[33] "rstatix"       "ggpubr"        "ggplot2"       "stats"        
[37] "graphics"      "grDevices"     "utils"         "datasets"     
[41] "methods"       "base"         

[[24]]
 [1] "metacoder"     "emmeans"       "lme4"          "Matrix"       
 [5] "zCompositions" "survival"      "truncnorm"     "MASS"         
 [9] "RColorBrewer"  "ggtext"        "patchwork"     "pscl"         
[13] "sjPlot"        "phyloseq"      "vegan"         "permute"      
[17] "FactoMineR"    "kableExtra"    "knitr"         "lubridate"    
[21] "forcats"       "stringr"       "dplyr"         "purrr"        
[25] "readr"         "tidyr"         "tibble"        "tidyverse"    
[29] "readxl"        "ggpattern"     "ggrepel"       "multcompView" 
[33] "broom"         "rstatix"       "ggpubr"        "ggplot2"      
[37] "stats"         "graphics"      "grDevices"     "utils"        
[41] "datasets"      "methods"       "base"         

[[25]]
 [1] "foreach"       "metacoder"     "emmeans"       "lme4"         
 [5] "Matrix"        "zCompositions" "survival"      "truncnorm"    
 [9] "MASS"          "RColorBrewer"  "ggtext"        "patchwork"    
[13] "pscl"          "sjPlot"        "phyloseq"      "vegan"        
[17] "permute"       "FactoMineR"    "kableExtra"    "knitr"        
[21] "lubridate"     "forcats"       "stringr"       "dplyr"        
[25] "purrr"         "readr"         "tidyr"         "tibble"       
[29] "tidyverse"     "readxl"        "ggpattern"     "ggrepel"      
[33] "multcompView"  "broom"         "rstatix"       "ggpubr"       
[37] "ggplot2"       "stats"         "graphics"      "grDevices"    
[41] "utils"         "datasets"      "methods"       "base"         

[[26]]
 [1] "doParallel"    "parallel"      "iterators"     "foreach"      
 [5] "metacoder"     "emmeans"       "lme4"          "Matrix"       
 [9] "zCompositions" "survival"      "truncnorm"     "MASS"         
[13] "RColorBrewer"  "ggtext"        "patchwork"     "pscl"         
[17] "sjPlot"        "phyloseq"      "vegan"         "permute"      
[21] "FactoMineR"    "kableExtra"    "knitr"         "lubridate"    
[25] "forcats"       "stringr"       "dplyr"         "purrr"        
[29] "readr"         "tidyr"         "tibble"        "tidyverse"    
[33] "readxl"        "ggpattern"     "ggrepel"       "multcompView" 
[37] "broom"         "rstatix"       "ggpubr"        "ggplot2"      
[41] "stats"         "graphics"      "grDevices"     "utils"        
[45] "datasets"      "methods"       "base"         

[[27]]
 [1] "doParallel"    "parallel"      "iterators"     "foreach"      
 [5] "metacoder"     "emmeans"       "lme4"          "Matrix"       
 [9] "zCompositions" "survival"      "truncnorm"     "MASS"         
[13] "RColorBrewer"  "ggtext"        "patchwork"     "pscl"         
[17] "sjPlot"        "phyloseq"      "vegan"         "permute"      
[21] "FactoMineR"    "kableExtra"    "knitr"         "lubridate"    
[25] "forcats"       "stringr"       "dplyr"         "purrr"        
[29] "readr"         "tidyr"         "tibble"        "tidyverse"    
[33] "readxl"        "ggpattern"     "ggrepel"       "multcompView" 
[37] "broom"         "rstatix"       "ggpubr"        "ggplot2"      
[41] "stats"         "graphics"      "grDevices"     "utils"        
[45] "datasets"      "methods"       "base"         

[[28]]
 [1] "scales"        "doParallel"    "parallel"      "iterators"    
 [5] "foreach"       "metacoder"     "emmeans"       "lme4"         
 [9] "Matrix"        "zCompositions" "survival"      "truncnorm"    
[13] "MASS"          "RColorBrewer"  "ggtext"        "patchwork"    
[17] "pscl"          "sjPlot"        "phyloseq"      "vegan"        
[21] "permute"       "FactoMineR"    "kableExtra"    "knitr"        
[25] "lubridate"     "forcats"       "stringr"       "dplyr"        
[29] "purrr"         "readr"         "tidyr"         "tibble"       
[33] "tidyverse"     "readxl"        "ggpattern"     "ggrepel"      
[37] "multcompView"  "broom"         "rstatix"       "ggpubr"       
[41] "ggplot2"       "stats"         "graphics"      "grDevices"    
[45] "utils"         "datasets"      "methods"       "base"         

[[29]]
 [1] "colorspace"    "scales"        "doParallel"    "parallel"     
 [5] "iterators"     "foreach"       "metacoder"     "emmeans"      
 [9] "lme4"          "Matrix"        "zCompositions" "survival"     
[13] "truncnorm"     "MASS"          "RColorBrewer"  "ggtext"       
[17] "patchwork"     "pscl"          "sjPlot"        "phyloseq"     
[21] "vegan"         "permute"       "FactoMineR"    "kableExtra"   
[25] "knitr"         "lubridate"     "forcats"       "stringr"      
[29] "dplyr"         "purrr"         "readr"         "tidyr"        
[33] "tibble"        "tidyverse"     "readxl"        "ggpattern"    
[37] "ggrepel"       "multcompView"  "broom"         "rstatix"      
[41] "ggpubr"        "ggplot2"       "stats"         "graphics"     
[45] "grDevices"     "utils"         "datasets"      "methods"      
[49] "base"         

[[30]]
 [1] "gplots"        "colorspace"    "scales"        "doParallel"   
 [5] "parallel"      "iterators"     "foreach"       "metacoder"    
 [9] "emmeans"       "lme4"          "Matrix"        "zCompositions"
[13] "survival"      "truncnorm"     "MASS"          "RColorBrewer" 
[17] "ggtext"        "patchwork"     "pscl"          "sjPlot"       
[21] "phyloseq"      "vegan"         "permute"       "FactoMineR"   
[25] "kableExtra"    "knitr"         "lubridate"     "forcats"      
[29] "stringr"       "dplyr"         "purrr"         "readr"        
[33] "tidyr"         "tibble"        "tidyverse"     "readxl"       
[37] "ggpattern"     "ggrepel"       "multcompView"  "broom"        
[41] "rstatix"       "ggpubr"        "ggplot2"       "stats"        
[45] "graphics"      "grDevices"     "utils"         "datasets"     
[49] "methods"       "base"         

[[31]]
 [1] "statmod"       "gplots"        "colorspace"    "scales"       
 [5] "doParallel"    "parallel"      "iterators"     "foreach"      
 [9] "metacoder"     "emmeans"       "lme4"          "Matrix"       
[13] "zCompositions" "survival"      "truncnorm"     "MASS"         
[17] "RColorBrewer"  "ggtext"        "patchwork"     "pscl"         
[21] "sjPlot"        "phyloseq"      "vegan"         "permute"      
[25] "FactoMineR"    "kableExtra"    "knitr"         "lubridate"    
[29] "forcats"       "stringr"       "dplyr"         "purrr"        
[33] "readr"         "tidyr"         "tibble"        "tidyverse"    
[37] "readxl"        "ggpattern"     "ggrepel"       "multcompView" 
[41] "broom"         "rstatix"       "ggpubr"        "ggplot2"      
[45] "stats"         "graphics"      "grDevices"     "utils"        
[49] "datasets"      "methods"       "base"         

[[32]]
 [1] "ComplexUpset"  "statmod"       "gplots"        "colorspace"   
 [5] "scales"        "doParallel"    "parallel"      "iterators"    
 [9] "foreach"       "metacoder"     "emmeans"       "lme4"         
[13] "Matrix"        "zCompositions" "survival"      "truncnorm"    
[17] "MASS"          "RColorBrewer"  "ggtext"        "patchwork"    
[21] "pscl"          "sjPlot"        "phyloseq"      "vegan"        
[25] "permute"       "FactoMineR"    "kableExtra"    "knitr"        
[29] "lubridate"     "forcats"       "stringr"       "dplyr"        
[33] "purrr"         "readr"         "tidyr"         "tibble"       
[37] "tidyverse"     "readxl"        "ggpattern"     "ggrepel"      
[41] "multcompView"  "broom"         "rstatix"       "ggpubr"       
[45] "ggplot2"       "stats"         "graphics"      "grDevices"    
[49] "utils"         "datasets"      "methods"       "base"         

[[33]]
 [1] "paletteer"     "ComplexUpset"  "statmod"       "gplots"       
 [5] "colorspace"    "scales"        "doParallel"    "parallel"     
 [9] "iterators"     "foreach"       "metacoder"     "emmeans"      
[13] "lme4"          "Matrix"        "zCompositions" "survival"     
[17] "truncnorm"     "MASS"          "RColorBrewer"  "ggtext"       
[21] "patchwork"     "pscl"          "sjPlot"        "phyloseq"     
[25] "vegan"         "permute"       "FactoMineR"    "kableExtra"   
[29] "knitr"         "lubridate"     "forcats"       "stringr"      
[33] "dplyr"         "purrr"         "readr"         "tidyr"        
[37] "tibble"        "tidyverse"     "readxl"        "ggpattern"    
[41] "ggrepel"       "multcompView"  "broom"         "rstatix"      
[45] "ggpubr"        "ggplot2"       "stats"         "graphics"     
[49] "grDevices"     "utils"         "datasets"      "methods"      
[53] "base"         
# Pkg -----
#library(DESeq2) #use bioconductor to install it with BiocManager
library(ggpubr)
library(rstatix)
library(broom)
library(ggplot2)
library(multcompView)
library(ggrepel)
library(ggpattern)
library(readxl)
library(tidyverse)
library(dplyr)
library(knitr)
library(kableExtra)
library(FactoMineR)
library(qiime2R) #remotes::install_github("jbisanz/qiime2R") # current version is 0.99.20 or if (!requireNamespace("devtools", quietly = TRUE)){install.packages("devtools")}
                                                                                                  #devtools::install_github("jbisanz/qiime2R")
library(vegan)
library(phyloseq)
library(sjPlot) #https://cran.r-project.org/web/packages/sjPlot/vignettes/tab_model_estimates.html
library(pscl)
library(patchwork)
library(ggtext) #install.packages("ggtext")
library(RColorBrewer)
library(zCompositions)
library(lme4)
library(emmeans)
library(readxl)
library(metacoder) # devtools::install_github("grunwaldlab/metacoder") # need rtools
library(foreach)
library(doParallel)
library(ggrepel)
library(forcats)
library(scales)
library(patchwork)
library(knitr)
library(colorspace)
library(gplots)
#library(edgeR) #I hope i not need that
#library(ade4TkGUI) #install.packages("ade4TkGUI")
library(statmod)
library(ComplexUpset)
library(paletteer) 

# function ---- 
source(here::here("src/function/collect_mcom_counts.R"))

# cosmetics ----
my_color_palette <- read_excel(here::here("data/color_palette.xlsm"))

set.seed(1) # set seed for analysis reproducibility
df_16S_taxonomy<-parse_taxonomy(read_qza(here::here("data/mcom/taxonomy_16S_soybean_2021.qza"))$data) %>% 
  dplyr::mutate(Kingdom=substr(Kingdom,4,nchar(Kingdom))) %>% 
  rename_all(tolower) %>% 
  rownames_to_column("ASV")

df_ITS_taxonomy<-parse_taxonomy(read_qza(here::here("data/mcom/taxonomy_ITS_soybean_2021.qza"))$data) %>% 
  dplyr::mutate(Kingdom=substr(Kingdom,4,nchar(Kingdom))) %>% 
  rename_all(tolower) %>% 
  rownames_to_column("ASV")
#Upload your QZA file containing the phylogenetic tree
#<- "data/test2/rooted-tree.qza"
df_16S_rooted_tree<-read_qza(here::here("data/mcom/rooted_tree_phylogeny_16S_soybean_2021.qza"))$data
df_16S_unrooted_tree<-read_qza(here::here("data/mcom/unrooted_tree_phylogeny_16S_soybean_2021.qza"))$data

#<- "data/test2/rooted-tree.qza"
df_ITS_rooted_tree<-read_qza(here::here("data/mcom/rooted_tree_phylogeny_ITS_soybean_2021.qza"))$data
df_ITS_unrooted_tree<-read_qza(here::here("data/mcom/unrooted_tree_phylogeny_ITS_soybean_2021.qza"))$data

#physeq <- merge_phyloseq(phyloseq_16S, tree)
# colnames(feature_table)[1] ="ASV_ID"
df_16S_count_chlorophyll_mito <- read.csv(here::here("data/mcom/asv_table_16S_soybean_2021.csv"), sep="\t",skip = 1) %>% 
      dplyr::rename_at(vars(starts_with("X")), #delet the A at the beginning 
                ~substring(.x,2)) %>% 
      dplyr::rename( "ASV"=".OTU.ID") %>%  #rename first column
      dplyr::rename_at(vars(ends_with(".effective.fastq")),
                 ~sub("\\.\\.?effective\\.fastq$","", .x))

df_16S_data_loss<-collect_mcom_counts(df_i = df_16S_count_chlorophyll_mito %>% column_to_rownames("ASV"),new_column_name = "raw_data_with_chloro_and_mito")
df_16S_count_chlorophyll_mito <- df_16S_count_chlorophyll_mito %>%
  anti_join(data.frame(ASV = df_16S_taxonomy %>% 
                         filter(phylum == "Cyanobacteria") %>%
                         pull("ASV")), 
            by = "ASV") %>% 
  as.data.frame() %>% 
  column_to_rownames("ASV")
# sum(as.matrix(df_16S_count[,2:length(df_16S_count)]))
# length(df_16S_count)

df_16S_data_loss<-collect_mcom_counts(storage = df_16S_data_loss, df_i = df_16S_count_chlorophyll_mito,new_column_name = "raw_without_Cyanobacteria_phylum")

df_16S_count_mito <- df_16S_count_chlorophyll_mito %>%
  rownames_to_column("ASV") %>% 
  anti_join(data.frame(ASV = df_16S_taxonomy %>% 
                         filter(genus == "Chloroplast") %>%
                         pull("ASV")), 
            by = "ASV") %>% 
  as.data.frame() %>% 
  column_to_rownames("ASV")
# sum(as.matrix(df_16S_count_mito[,2:length(df_16S_count_mito)]))
# length(df_16S_count_mito)
df_16S_data_loss<-collect_mcom_counts(storage = df_16S_data_loss, df_i = df_16S_count_mito,new_column_name = "raw_without_Chloroplast_genus")

df_16S_count <- df_16S_count_mito %>%
  rownames_to_column("ASV") %>% 
  anti_join(data.frame(ASV = df_16S_taxonomy %>% 
                         filter(genus == "Mitochondria") %>%
                         pull("ASV")), 
            by = "ASV") %>% 
  as.data.frame() %>% 
  column_to_rownames("ASV")
# sum(as.matrix(df_16S_count[,2:length(df_16S_count)]))
# length(df_16S_count)

df_16S_data_loss<-collect_mcom_counts(storage = df_16S_data_loss, df_i = df_16S_count,new_column_name = "raw_without_mitochondria_genus")
df_ITS_count <- read.csv(here::here("data/mcom/asv_table_ITS_soybean_2021.csv"), sep="\t",skip = 1) %>% 
      dplyr::rename_at(vars(starts_with("X")), #delet the A at the beginning 
                ~substring(.x,2)) %>% 
      dplyr::rename( "ASV"=".OTU.ID") %>%  #rename first column
      dplyr::rename_at(vars(ends_with(".effective.fastq")),
                 ~sub("\\.\\.?effective\\.fastq$","", .x)) %>% 
  column_to_rownames("ASV")
# for 16S
df_16S_metadata<-data.frame(sample_name=colnames(df_16S_count)) %>% 
  mutate(compartment= substr(sample_name,
                                     nchar(sample_name) -1,
                                           nchar(sample_name))) %>% #Add column compartment using last two value from rownames and add plant_num by using the number contain in the rownames
  mutate(plant_num=as.numeric(unlist(str_extract_all(sample_name, "\\d+")))) %>% 
  inner_join(.,read_xlsx(here::here("data/plant_information.xlsx")) %>% 
  dplyr::select("plant_num","condition","genotype","water_condition","heat_condition"), by="plant_num") %>% 
  mutate(compartment_l = case_when(
    compartment == "lo" ~ "Phylloplane",
    compartment == "li" ~ "Leaf endosphere",
    compartment == "ri" ~ "Root endosphere",
    compartment == "ro" ~ "Rhizoplane",
    compartment == "rh" ~ "Rhizosphere",
    compartment == "bk" ~ "Bulk soil",
    TRUE ~ NA_character_  # Keep the original value if none of the conditions match
  )) %>% 
  mutate(climat_condition=paste0(water_condition,"_",heat_condition)) %>% 
  mutate(climat_condition=factor(climat_condition,levels=c("WW_OT","WS_OT","WW_HS","WS_HS"))) %>% 
  mutate(compartment_l=factor(compartment_l,levels=c("Phylloplane","Leaf endosphere","Root endosphere","Rhizoplane","Rhizosphere","Bulk soil")))

# For ITS
df_ITS_metadata<-data.frame(sample_name=colnames(df_ITS_count)) %>% 
  mutate(compartment= substr(sample_name,
                                     nchar(sample_name) -1,
                                           nchar(sample_name))) %>% #Add column compartment using last two value from rownames and add plant_num by using the number contain in the rownames
  mutate(plant_num=as.numeric(unlist(str_extract_all(sample_name, "\\d+")))) %>% 
  inner_join(.,read_xlsx(here::here("data/plant_information.xlsx")) %>% 
  dplyr::select("plant_num","condition","genotype","water_condition","heat_condition"), by="plant_num") %>% 
  mutate(compartment_l = case_when(
    compartment == "lo" ~ "Phylloplane",
    compartment == "li" ~ "Leaf endosphere",
    compartment == "ri" ~ "Root endosphere",
    compartment == "ro" ~ "Rhizoplane",
    compartment == "rh" ~ "Rhizosphere",
    compartment == "bk" ~ "Bulk soil",
    TRUE ~ NA_character_  # Keep the original value if none of the conditions match
  )) %>% 
  mutate(climat_condition=paste0(water_condition,"_",heat_condition)) %>% 
  mutate(climat_condition=factor(climat_condition,levels=c("WW_OT","WS_OT","WW_HS","WS_HS"))) %>% 
  mutate(compartment_l=factor(compartment_l,levels=c("Phylloplane","Leaf endosphere","Root endosphere","Rhizoplane","Rhizosphere","Bulk soil")))

For 16S there is a total of 240 samples and 7471 ASV at this step and a total of 1.056717e+07 count.

For ITS there is a total of 144 samples and 7235 ASV at this step and a total of 1.428166e+07 count.

plot_x=df_16S_data_loss %>% 
  pivot_longer(-sample_name,names_to = "step",values_to = 'count') %>% 
  left_join(.,df_16S_metadata, by="sample_name") %>% 
  dplyr::group_by(compartment_l,step) %>%
        dplyr::summarise(mean_count = mean(count),
                         N=n(),
                         sd=sd(count),
                         se=sd(count)/sqrt(N),
                         .groups = "drop") %>% 
  ggplot(aes(x=step,y=mean_count,col=compartment_l,group = compartment_l))+
           geom_errorbar(aes(ymin=mean_count-se,
                    ymax=mean_count+se),
                width = 0.2,
                position = position_dodge(width = 0.2)) +
  geom_line(position = position_dodge(width = 0.2)) +
  geom_point(position = position_dodge(width = 0.2))+
  guides(col = guide_legend(title = "Compartment"))+
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust = 0.2))+
  scale_color_manual(
    values = my_color_palette %>%  filter(set=="compartment") %>% pull(color),
    name=c(my_color_palette %>% filter(set=="compartment") %>% dplyr::select(treatment) %>% pull(treatment))
    )
ggsave("report/plot/loos_Each_step.svg", 
       plot_x, width = 11,height = 11, units = "cm")

3.1 Alpha-diversity

df_16S_count=df_16S_count %>% 
  rownames_to_column("ASV") %>% 
  pivot_longer(-ASV, names_to = "sample_name", values_to = "count") %>% 
  filter(!sample_name %in% c("74bisro","1094rh","1118ri","1060ri","1046li")) %>% 
  dplyr::group_by(sample_name) %>% 
  mutate(total=sum(count)) %>% 
  filter(total>16000) %>% 
  dplyr::group_by(ASV) %>% 
  mutate(total=sum(count)) %>% 
  filter(total!=0) %>% 
  ungroup() %>% 
  dplyr::select(-total) %>% 
  pivot_wider(names_from = "sample_name", values_from = "count") %>% 
  column_to_rownames("ASV")

df_16S_data_loss<-collect_mcom_counts(storage = df_16S_data_loss, df_i = df_16S_count,new_column_name = "raw_first_filter")
#
# test=df_16S_data_loss %>%
#   pivot_longer(-sample_name,names_to = "step",values_to = 'count') %>%
#   left_join(.,df_16S_metadata, by="sample_name") %>%
#   filter(step=="raw_first_filter") %>%
#   drop_na(count) %>%
#   dplyr::group_by(compartment_l,condition) %>%
#         dplyr::summarise(mean_count = mean(count,na.rm = TRUE),
#                          N=n(),
#                          sd=sd(count,na.rm = TRUE),
#                          se=sd(count,na.rm = TRUE)/sqrt(N),
#                          min=min(count),
#                          .groups = "drop")

save(df_16S_count,file=here::here(paste0("data/mcom/output/save/df_16S_count_importation_amplicon.RData")))
save(df_16S_data_loss,file=here::here(paste0("data/mcom/output/save/df_16S_data_loss_importation_amplicon.RData")))
#Update of the medadata
df_16S_metadata=df_16S_metadata %>% 
  inner_join(data.frame(sample_name = df_16S_count %>% 
                         t() %>% 
                         as.data.frame() %>% 
                         rownames_to_column("sample_name") %>% 
                         pull(sample_name)), 
            by = "sample_name") %>% 
  mutate(compartment_l=factor(compartment_l,levels=c("Phylloplane","Root endosphere","Rhizoplane","Rhizosphere","Bulk soil")))

save(df_16S_metadata,file=here::here(paste0("data/mcom/output/save/df_16S_metadata_importation_amplicon.RData")))

Microbial ecologists explore sequencing depth though a rarefaction curve. The rarefaction curve shows how many new ASVs are observed when we obtain new reads for a given sample. If the sequencing depth is enough, we should observe a plateau, meaning that even if we sequence new reads they will belong to ASVs already observed. In other word, all the diversity present in a sample is already described and we have sequenced the community deeply enough. This analysis should be execute on the raw data.

rarecurve_data=vegan::rarecurve(df_16S_count %>% t(), step = 100, cex = 0.75, las = 1) 
rarecurve_data_ITS=vegan::rarecurve(df_ITS_count %>% t(), step = 100, cex = 0.75, las = 1) 

min_n_seqs=min(
  df_16S_count %>% 
  rownames_to_column("ASV") %>% 
  pivot_longer(-ASV, names_to = "sample_name", values_to = "count") %>% 
  dplyr::group_by(sample_name) %>% 
  dplyr::summarise(total=sum(count)) %>% 
  pull(total)
  )

min_n_seqs_ITS=min(
  df_ITS_count %>% 
  rownames_to_column("ASV") %>% 
  pivot_longer(-ASV, names_to = "sample_name", values_to = "count") %>% 
  dplyr::group_by(sample_name) %>% 
  dplyr::summarise(total=sum(count)) %>% 
  pull(total)
  )

PlotAlpha_rarefaction<-map_dfr(rarecurve_data,bind_rows) %>% 
  bind_cols(Group=rownames(df_16S_count %>% t()),.) %>% 
  pivot_longer(-Group) %>% 
  drop_na() %>% 
  mutate(n_seqs=as.numeric(str_replace(name,"N",""))) %>% 
  dplyr::select(-name) %>% 
  ggplot(aes(x=n_seqs,y=value,group=Group))+
  geom_vline (xintercept=min_n_seqs,color="red")+
  geom_vline (xintercept=16000,color="green")+
  geom_line()+
  theme_classic()

PlotAlpha_rarefaction_ITS<-map_dfr(rarecurve_data_ITS,bind_rows) %>% 
  bind_cols(Group=rownames(df_ITS_count %>% t()),.) %>% 
  pivot_longer(-Group) %>% 
  drop_na() %>% 
  mutate(n_seqs=as.numeric(str_replace(name,"N",""))) %>% 
  dplyr::select(-name) %>% 
  ggplot(aes(x=n_seqs,y=value,group=Group))+
  geom_vline (xintercept=min_n_seqs_ITS,color="red")+
  geom_vline (xintercept=83000,color="green")+
  geom_line()+
  theme_classic()

ggsave(here::here("report/plot/Rarecurve_all.svg"),PlotAlpha_rarefaction ,width = 18,height = 16, units = "cm")
ggsave(here::here("report/plot/Rarecurve_all_ITS.svg"),PlotAlpha_rarefaction_ITS ,width = 18,height = 16, units = "cm")

# Same but for each compartment
source(here::here("src/function/f_rarecurve_compartment.R"))

# for 16S
PlotsAlphaCompartment=
  rarecurve_compartment("lo")+
#  rarecurve_compartment("li")+
  rarecurve_compartment("ri")+
  rarecurve_compartment("ro")+
  rarecurve_compartment("rh")+
  rarecurve_compartment("bk")+
#guide_area() + #tres utile pour mettre dans un trou pour ne pas perdre de place 
#plot_annotation(tag_levels = 'A')+
plot_layout(ncol = 2) 

# for ITS
PlotsAlphaCompartment_ITS=
  rarecurve_compartment("lo",type = "ITS")+
  rarecurve_compartment("li", type = "ITS")+
  rarecurve_compartment("ri", type = "ITS")+
  rarecurve_compartment("ro", type = "ITS")+
  rarecurve_compartment("rh", type = "ITS")+
  rarecurve_compartment("bk", type = "ITS")+
#guide_area() + #tres utile pour mettre dans un trou pour ne pas perdre de place 
#plot_annotation(tag_levels = 'A')+
plot_layout(ncol = 2) 

#export 
ggsave(here::here("report/plot/Rarecurve_compartment.svg"),PlotsAlphaCompartment ,width = 29,height = 28, units = "cm")
ggsave(here::here("report/plot/Rarecurve_compartment_ITS.svg"),PlotsAlphaCompartment_ITS ,width = 29,height = 28, units = "cm")
# export data for next script 
## For 16S
save(df_16S_count,file=here::here(paste0("data/mcom/output/save/df_16S_count_importation_amplicon.RData")))
save(df_16S_data_loss,file=here::here(paste0("data/mcom/output/save/df_16S_data_loss_importation_amplicon.RData")))
save(df_16S_metadata,file=here::here(paste0("data/mcom/output/save/df_16S_metadata_importation_amplicon.RData")))
save(df_16S_taxonomy,file=here::here(paste0("data/mcom/output/save/df_16S_taxonomy_importation_amplicon.RData")))
save(df_16S_rooted_tree,file=here::here(paste0("data/mcom/output/save/df_16S_rooted_tree_importation_amplicon.RData"))) # not use in this script but after

## For ITS
save(df_ITS_count,file=here::here(paste0("data/mcom/output/save/df_ITS_count_importation_amplicon.RData")))
save(df_ITS_metadata,file=here::here(paste0("data/mcom/output/save/df_ITS_metadata_importation_amplicon.RData")))
save(df_ITS_taxonomy,file=here::here(paste0("data/mcom/output/save/df_ITS_taxonomy_importation_amplicon.RData")))
save(df_ITS_rooted_tree,file=here::here(paste0("data/mcom/output/save/df_ITS_rooted_tree_importation_amplicon.RData"))) # not use in this script but after