\encoding{UTF-8}
\name{chem16S-package}
\alias{chem16S-package}
\alias{chem16S}
\docType{package}
\title{Chemical metrics for microbial communities}

\description{

Functions and data to calculate chemical metrics for reference proteomes for microbial (archaeal and bacterial) communities.
Amino acid compositions of community reference proteomes are generated by combining reference proteomes of taxa (derived from GTDB or RefSeq) with taxonomic classifications of 16S rRNA gene sequences.

}

\details{

  \itemize{
    \item \code{\link{read_RDP}} - Read and filter an RDP Classifier table
    \item \code{\link{map_taxa}} - Map RDP Classifier assignments to the NCBI taxonomy
    \item \code{\link{get_metrics}} - Get chemical metrics for community reference proteomes
    \item \code{\link{get_metadata}} - Example function for retrieving sample metadata
    \item \code{\link{plot_metrics}} - Plot metrics with symbols and colors based on metadata
    \item \code{\link{physeq}} - Functions designed to analyze \code{\link[phyloseq]{phyloseq-class}} objects
  }

History:
  \enumerate{
    \item Work begun in 2021. The combination of RefSeq reference proteomes with taxonomic abundances to compute community-level chemical metrics was described by Dick and Tan (2023). \pkg{chem16S} originated as code in the \pkg{JMDplots} package (\url{https://github.com/jedick/JMDplots}).
    \item Development in 2022. Dick and Meng (2023) compared community \Zc with redox potential measurements from local to global scales. The term \dQuote{community reference proteomes} was first applied, and \pkg{chem16S} was split into a separate package.
    \item Late 2022. GTDB r207 was added as a reference database.
    \item June--July 2023. Integration with \pkg{phyloseq} and addition of vignettes: \emph{Chemical metrics of reference proteomes}, \emph{Integration of chem16S with phyloseq}, and \emph{Plotting two chemical metrics}. Default reference database changed to GTDB r207.
    \item April 2024. Updated to GTDB r214.
    \item July 2024. Updated to GTDB r220.
  }

}

\section{Options set in package chem16S}{

  \pkg{chem16S} sets an option using the global \code{\link{options}} mechanism in R.
  This option will be set when package \pkg{chem16S} (or its namespace) is loaded if not already set.

  \describe{

    \item{\code{manual_mappings}}{
      A data frame of mappings between RDP and NCBI (RefSeq) taxonomies, which is read from \file{extdata/manual_mappings.csv}.
      The columns include \code{RDP.rank}, \code{RDP.name}, \code{NCBI.rank}, \code{NCBI.name}, and \code{notes}.
      This option is made available so the user can modify the manual mappings used by \code{\link{map_taxa}} at runtime.
    }

  }

}


\section{Files in RefDB/RefSeq_206}{

  NOTE: None of the \file{*.R} files in the \file{extdata} directories are included in the package submitted to CRAN; see GitHub or Zenodo for these files.

  This directory contains two sets of files:
    1) scripts to process source RefSeq sequence files to generate amino acid compositions of species-level reference proteomes and taxonomic names;
    2) script and output for amino acid compositions of higher-level taxa.
  The files are based on RefSeq release 206 of 2021-05-21 (O'Leary et al., 2016).

  \describe{

    \item{\file{README.txt}}{Description of steps to generate reference proteomes of species-level taxa (including downloads and shell commands).}
    \item{\file{gencat.sh}}{Helper script to extract microbial protein records from the RefSeq catalog.}
    \item{\file{genome_AA.R}}{
      R code to sum the amino acid compositions of all proteins for each bacterial, archaeal, and viral species in the NCBI Reference Sequence database.
      NOTE: To save space in this package, the output file (\file{genome_AA.csv}) is stored in the \code{RefDB/RefSeq_206} directory of the JMDplots package on GitHub (\url{https://github.com/jedick/JMDplots}).
      The first five columns are: \code{protein} (\dQuote{refseq}), \code{organism} (taxonomic id), \code{ref} (organism name), \code{abbrv} (empty), \code{chains} (number of protein sequences for this organism).
      Columns 6 to 25 have the counts of amino acids.
    }
    \item{\file{taxonomy.R}}{
      R code for processing taxonomic IDs; the output file is \file{taxonomy.csv}.
      The columns are NCBI taxonomic ID (taxid), and names at different taxonomic ranks (species, genus, family, order, class, phylum, superkingdom).
    }

    \item{\file{taxon_AA.R}}{Functions to create the file listed below:}
    \item{\file{taxon_AA.csv.xz}}{Average amino acid composition of reference proteomes for all species in each genus, family, order, class, phylum, and superkingdom.}

  }

}

\section{Files in RefDB/GTDB_220}{

  \describe{
    \item{\file{taxon_AA.R}}{Functions to process GTDB source files (Parks et al., 2022) and produce the following output file:}
    \item{\file{taxon_AA.csv.xz}}{
      Average amino acid composition of reference proteomes for all species in each genus, family, order, class, phylum, and domain.
      In both this file and the corresponding file for RefSeq (see above), the \code{protein}, \code{organism}, \code{ref}, and \code{abbrv} columns contain the rank, taxon name, number of species used to generate the amino acid composition of this taxon, and parent taxon.
      \code{chains} is \code{1}, denoting a single polypeptide chain, so the amino acid composition represents the average per-protein amino acid composition in this taxon, and the sum of amino acid counts is the average protein length.
    }
  }

}

\section{Files in extdata/metadata}{

  \describe{
    \item{\file{BGPF13.csv}}{Metadata for Heart Lake Geyser Basin, Yellowstone (Bowen De León et al., 2013).}
    \item{\file{HLA+16.csv}}{Metadata for the Baltic Sea (Herlemann et al., 2016).}
    \item{\file{SMS+12.csv}}{Metadata for Bison Pool, Yellowstone (Swingley et al., 2012).}
  }

}

\section{Files in extdata/RDP}{

  Output of RDP Classifier with the default training set.

  \describe{
    \item{\file{pipeline.R}}{
      Pipeline for sequence data processing (uses external programs fastq-dump, vsearch, seqtk, RDP Classifier).
      This was used to make the files in both \file{RDP} and \file{RDP-GTDB_220} (the latter with \code{GTDB = TRUE} in the script).
    }
    \item{\file{BGPF13.tab.xz}}{Heart Lake Geyser Basin.}
    \item{\file{HLA+16.tab.xz}}{Baltic Sea.}
    \item{\file{SMS+12.tab.xz}}{Bison Pool.}
  }

}

\section{Files in extdata/RDP-GTDB_220}{

  Output of RDP Classifier trained with 16S rRNA sequences from GTDB release 220 (\doi{10.5281/zenodo.7633099}).

  \describe{
    \item{\file{BGPF13.tab.xz}}{Heart Lake Geyser Basin.}
    \item{\file{HLA+16.tab.xz}}{Baltic Sea.}
    \item{\file{SMS+12.tab.xz}}{Bison Pool.}
  }

}

\section{Files in extdata/DADA2-GTDB_220}{

  Identification and taxonomic classification of sequences using DADA2 with GTDB r220.

  \describe{
    \item{\file{FEN+22}}{
      Analysis of data from Fonseca et al. (2022) for marine sediment from the Humboldt Sulfuretum.
      \file{pipeline.R} has the commands used to process the 16S rRNA gene sequence data and was adapted by Jeffrey Dick from the DADA2 pipeline tutorial (Callahan, 2020).
      \file{SraRunInfo.csv} was obtained from the NCBI Sequence Read Archive (SRA) (\url{https://www.ncbi.nlm.nih.gov/sra/?term=PRJNA251688}).
      \file{sample_data.csv} has data obtained from NCBI BioSample records for BioProject PRJNA251688.
      \file{*.png} are several plots created while running the DADA2 pipeline.
      \file{ps_FEN+22.rds} contains the phyloseq object (including \code{otu_table}, \code{sample_data}, and \code{refseq} objects) created at the end of the DADA2 pipeline.
    }
  }

  \describe{
    \item{\file{ZFZ+23}}{
      Analysis of data from Zhang et al. (2023) for hot springs in the Qinghai-Tibet Plateau.
      \file{pipeline.R} has the commands used to process the 16S rRNA gene sequence data and was adapted by Jeffrey Dick from the DADA2 pipeline tutorial (Callahan, 2020).
      \file{SraRunInfo.csv} was obtained from the NCBI Sequence Read Archive (SRA) (\url{https://www.ncbi.nlm.nih.gov/sra/?term=PRJNA860942}).
      \file{sample_data.csv} has data obtained from NCBI BioSample records for BioProject PRJNA860942.
      \file{*.png} are several plots created while running the DADA2 pipeline.
      \file{ps_ZFZ+23.rds} contains the phyloseq object (including \code{otu_table}, \code{sample_data}, and \code{refseq} objects) created at the end of the DADA2 pipeline.
    }
  }

  \describe{
    \item{\file{mouse/pipeline.R}}{
      Pipeline adapted from the DADA2 pipeline tutorial (Callahan, 2020) for processing the mouse example dataset (see \code{\link{chem16S-data}}).
    }
  }

}

\references{

Bowen De León K, Gerlach R, Peyton BM, Fields MW. 2013. Archaeal and bacterial communities in three alkaline hot springs in Heart Lake Geyser Basin, Yellowstone National Park. \emph{Frontiers in Microbiology} \bold{4}: 330. \doi{10.3389/fmicb.2013.00330}

Callahan B. 2020. DADA2 Pipeline Tutorial (1.16). \url{https://benjjneb.github.io/dada2/tutorial.html}, accessed on 2023-06-14.

Dick JM, Meng D. 2023. Community- and genome-based evidence for a shaping influence of redox potential on bacterial protein evolution. \emph{mSystems} \bold{8}(3): e00014-23. \doi{10.1128/msystems.00014-23}

Dick JM, Tan J. 2023. Chemical links between redox conditions and estimated community proteomes from 16S rRNA and reference protein sequences. \emph{Microbial Ecology} \bold{85}: 1338--1355. \doi{10.1007/s00248-022-01988-9}

Fonseca A, Espinoza C, Nielsen LP, Marshall IPG, Gallardo VA. 2022. Bacterial community of sediments under the Eastern Boundary Current System shows high microdiversity and a latitudinal spatial pattern. \emph{Frontiers in Microbiology} \bold{13}: 1016418. \doi{10.3389/fmicb.2022.1016418}

Herlemann DPR, Lundin D, Andersson AF, Labrenz M, Jürgens K. 2016. Phylogenetic signals of salinity and season in bacterial community composition across the salinity gradient of the Baltic Sea. \emph{Frontiers in Microbiology} \bold{7}: 1883. \doi{10.3389/fmicb.2016.01883}

O'Leary NA et al. 2016. Reference sequence (RefSeq) database at NCBI: current status, taxonomic expansion, and functional annotation. \emph{Nucleic Acids Research} \bold{44}: D733--D745. \doi{10.1093/nar/gkv1189}

Parks DH, Chuvochina M, Rinke C, Mussig AJ, Chaumeil P-A, Hugenholtz P. 2022. GTDB: an ongoing census of bacterial and archaeal diversity through a phylogenetically consistent, rank normalized and complete genome-based taxonomy. \emph{Nucleic Acids Research} \bold{50}: D785--D794. \doi{10.1093/nar/gkab776}

Swingley WD, Meyer-Dombard DR, Shock EL, Alsop EB, Falenski HD, Havig JR, Raymond J. 2012. Coordinating environmental genomics and geochemistry reveals metabolic transitions in a hot spring ecosystem. \emph{PLOS One} \bold{7}(6): e38108. \doi{10.1371/journal.pone.0038108}

Zhang H-S, Feng Q-D, Zhang D-Y, Zhu G-L, Yang L. 2023. Bacterial community structure in geothermal springs on the northern edge of Qinghai-Tibet plateau. \emph{Frontiers in Microbiology} \bold{13}: 994179. \doi{10.3389/fmicb.2022.994179}

}
