# Run_DADA2.R -- from the jwestrob/Script_Toybox repository (master branch).

if (!require("pacman")) install.packages("pacman")
pacman::p_load(argparse, dplyr, hash, ggplot2, dada2, phyloseq, stringr, seqinr)
require(Biostrings)

# ---- Command-line interface ----
# Build the argument parser. The description names the flag that actually
# controls each output (-p for the PDF, -st for the summary table) and the
# file suffixes that the list.files() patterns in the READ DATA section match.
parser <- ArgumentParser(description="Run the DADA2 pipeline on a given set of FASTQ sequences (unfiltered). Gives you a PDF with composition barplots and alpha diversity (-p) and a summary table containing taxon abundances (-st) as well as some other optional things if you feelin frisky. Please ensure that your fastq files end with PE.1.fastq.gz/PE.2.fastq.gz; otherwise go in and change the 'pattern' arguments in the READ DATA section.")

# Specify options
parser$add_argument("-s", "--sequences", type="character", nargs=1,
    help="Path to fastq files (Ideally the full path to the folder containing all your files).")
parser$add_argument("-st", "--summarytable", type="character", nargs=1,
    help="Name for your summary table (.tsv will be appended).")
parser$add_argument("-p", "--pdfout", type="character", nargs=1,
    help="PDF outfile name for plots.")
parser$add_argument("-ps", "--phyloseq", type="character", nargs=1,
    help="Name of file to save phyloseq object to. include .rds extenison.")
parser$add_argument("-f", "--fastaprefix", type="character", nargs=1,
    help="Name for fasta output files (used as prefix for aligned and unaligned FASTAs). REQUIRED")
parser$add_argument("-RT", "--TRAIN", type="character", nargs=1,
    help="Path to SILVA training data. REQUIRED")
parser$add_argument("-RS", "--SPECIES", type="character", nargs=1,
    help="Path to SILVA species-level taxonomy data. REQUIRED")
parser$add_argument("-t", "--taxa", type="character", default=NULL,
    help="Optional: Print original taxa table (ASV sequence format). Include .tsv extension.")
parser$add_argument("-o", "--otu", type="character", default=NULL,
    help="OPTIONAL: Print original OTU/ASV table (ASV sequence format). Include .tsv extension.")

# Parse the command line into a named list ("args")
args <- parser$parse_args()

# Fail fast on arguments the rest of the script uses unconditionally.
# Without this check a missing value only surfaces as an obscure error after
# the (long-running) denoising steps have already completed.
required_args <- c(
  sequences    = "-s/--sequences",
  summarytable = "-st/--summarytable",
  phyloseq     = "-ps/--phyloseq",
  fastaprefix  = "-f/--fastaprefix",
  TRAIN        = "-RT/--TRAIN",
  SPECIES      = "-RS/--SPECIES"
)
is_missing <- vapply(
  names(required_args),
  function(arg_name) length(args[[arg_name]]) == 0,
  logical(1)
)
if (any(is_missing)) {
  stop("Missing required argument(s): ",
       paste(required_args[is_missing], collapse = ", "),
       call. = FALSE)
}

fastqpath <- args$sequences
fastaprefix <- args$fastaprefix
summarytable <- args$summarytable
pdfout <- args$pdfout
silva_train <- args$TRAIN
silva_species <- args$SPECIES
phylo_file <- args$phyloseq

# Optional outputs: the flag is TRUE when a path was supplied; the path
# variable holds that path, or NULL when the option was omitted.
# NOTE: these path variables share their names with phyloseq's tax_table()
# and otu_table(); calls such as tax_table(ps) still resolve to the functions
# because R skips non-function objects when looking up a called name.
tax <- length(args$taxa) > 0
otu <- length(args$otu) > 0

tax_table <- if (tax) args$taxa else NULL
otu_table <- if (otu) args$otu else NULL

########################################
#              READ DATA               #
########################################

# Forward and reverse reads are the files in `fastqpath` whose names match the
# patterns below. `pattern` is a regular expression, so each unescaped "."
# matches any single character (e.g. both "PE.1.fastq.gz" and "PE_1.fastq.gz").
fnFs <- sort(list.files(fastqpath, pattern="PE.1.fastq.gz", full.names = TRUE))
fnRs <- sort(list.files(fastqpath, pattern="PE.2.fastq.gz", full.names = TRUE))

# Stop early with a clear message instead of failing deep inside dada2.
if (length(fnFs) == 0) {
  stop("No forward read files matching 'PE.1.fastq.gz' found in: ", fastqpath,
       call. = FALSE)
}
if (length(fnFs) != length(fnRs)) {
  stop("Found ", length(fnFs), " forward but ", length(fnRs),
       " reverse read files in: ", fastqpath, call. = FALSE)
}

# Sample name = everything before the first "_" in the forward file name.
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)

# Duplicate sample names would map several inputs onto the same filtered
# output path below, so refuse to continue.
if (anyDuplicated(sample.names) > 0) {
  stop("Sample names derived from file names are not unique: ",
       paste(unique(sample.names[duplicated(sample.names)]), collapse = ", "),
       call. = FALSE)
}

# All plots from here on go to the PDF; the device is closed after the
# summary table has been written.
pdf(pdfout)
print(plotQualityProfile(fnFs) + ggtitle('Forward read quality profiles.'))
print(plotQualityProfile(fnRs) + ggtitle('Reverse read quality profiles.'))

# Filtered reads are written to a "filtered" subdirectory of the input folder.
filt_path <- file.path(fastqpath, "filtered")
filtFs <- file.path(filt_path, paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(filt_path, paste0(sample.names, "_R_filt.fastq.gz"))

# Quality filtering of the paired reads; `out` is used later to build the
# per-sample read-tracking table.
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs,
              maxN=0, maxEE=c(5,5), truncQ=2, rm.phix=TRUE,
              compress=TRUE, multithread=15) # On Windows set multithread=FALSE

# ---- Error models ----
# Learn an error model from the filtered reads, one per read direction.
errF <- learnErrors(filtFs, multithread = 15)
errR <- learnErrors(filtRs, multithread = 15)

# One PDF page per direction showing the fitted error rates.
print(plotErrors(errF, nominalQ = TRUE) + ggtitle('Error rates - Forward reads'))
print(plotErrors(errR, nominalQ = TRUE) + ggtitle('Error rates - Reverse reads'))

# ---- Dereplication ----
# Dereplicate the filtered reads and label the results with the sample names.
derepFs <- setNames(derepFastq(filtFs, verbose = TRUE), sample.names)
derepRs <- setNames(derepFastq(filtRs, verbose = TRUE), sample.names)

# ---- Denoising and pair merging ----
dadaFs <- dada(derepFs, err = errF, multithread = 15)
dadaRs <- dada(derepRs, err = errR, multithread = 15)

mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose = TRUE)

# ---- Sequence table and chimera removal ----
seqtab <- makeSequenceTable(mergers)

#print("Sequence length distribution: ", table(nchar(getSequences(seqtab))))

seqtab.nochim <- removeBimeraDenovo(
  seqtab,
  method = "consensus",
  multithread = TRUE,
  verbose = TRUE
)

# ---- Read tracking ----
# Number of reads represented by an object, summed over its unique sequences.
getN <- function(x) sum(getUniques(x))

# One row per sample, one column per pipeline stage.
# If processing a single sample, remove the sapply calls:
# e.g. replace sapply(dadaFs, getN) with getN(dadaFs)
read_counts <- list(
  out,
  sapply(dadaFs, getN),
  sapply(mergers, getN),
  rowSums(seqtab),
  rowSums(seqtab.nochim)
)
track <- do.call(cbind, read_counts)
colnames(track) <- c("input", "filtered", "denoised", "merged", "tabled", "nonchim")
rownames(track) <- sample.names

# ---- Taxonomy assignment ----
# Assign taxonomy against the SILVA training set, then add species-level
# assignments from the SILVA species file.
taxa <- assignTaxonomy(seqtab.nochim, silva_train, multithread=15)
taxa <- addSpecies(taxa, silva_species)

# Not referenced again in the visible part of this script.
samples.out <- rownames(seqtab.nochim)

# ---- phyloseq object ----
# Samples are rows of the sequence table, hence taxa_are_rows=FALSE.
ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE),
               tax_table(taxa))

print(plot_richness(ps, measures=c("Shannon", "Simpson")) + theme_bw() + ggtitle("Alpha diversity; Shannon and Simpson"))

#ord.PCA.bray <- ordinate(ps, method="NMDS", distance="manhattan")

#print(plot_ordination(ps, ord.PCA.bray, title="Bray PCA") + theme_bw() + ggtitle("NMDS - Manhattan Distance"))

# ---- Most abundant taxa ----
# Taxa ranked by total count over all samples. head() returns fewer names
# when fewer than 20/80 taxa exist, where `[1:20]` would pad with NA.
ranked_taxa <- names(sort(taxa_sums(ps), decreasing=TRUE))
top20 <- head(ranked_taxa, 20)
top80 <- head(ranked_taxa, 80)

# Per-sample relative abundance over ALL taxa.
ps.alltaxa <- transform_sample_counts(ps, function(OTU) OTU/sum(OTU))

# ---- Shared plot styling ----
# Small legend keys and vertical sample labels (used by the first set of plots).
theme_rotated_x <- theme(legend.key.size = unit(2, "mm"),
                         axis.text.x = element_text(angle = 90, hjust = 1))
# Same, plus small axis text and bold axis titles (used by the renormalized plots).
theme_compact <- theme(legend.key.size = unit(2, "mm"),
                       axis.text = element_text(size = 6),
                       axis.title = element_text(size = 14, face = "bold"),
                       axis.text.x = element_text(angle = 90, hjust = 1))
# Legend laid out in 40 rows, filled row by row.
legend_40_rows <- guides(fill = guide_legend(nrow = 40, byrow = TRUE))

# ---- Top 20 ----
# "Non-normalized" here means: not rescaled within the top-20 subset. The
# values are still relative abundances over all taxa (from ps.alltaxa), so the
# bars do not necessarily reach 1.
ps.top20 <- prune_taxa(top20, ps.alltaxa)
print(plot_bar(ps.top20, fill="Genus") + theme_bw() +
        ggtitle("Non-normalized composition of top 20 genera") + theme_rotated_x)
print(plot_bar(ps.top20, fill="Family") + theme_bw() +
        ggtitle("Non-normalized composition of top 20 families") + theme_rotated_x)

# Rescale within the top-20 subset so every sample sums to 1, then plot.
# Every plot is wrapped in print() so that it is also drawn when the script
# is run via source(), where top-level values are not auto-printed.
ps.top20 <- transform_sample_counts(ps.top20, function(OTU) OTU/sum(OTU))
print(plot_bar(ps.top20, fill="Genus") +
        ggtitle("Normalized composition of top 20 genera") +
        legend_40_rows + theme_bw() + theme_compact)
# Title fixed: this plot is coloured by Family, not Genus.
print(plot_bar(ps.top20, fill="Family") +
        ggtitle("Normalized composition of top 20 families") +
        legend_40_rows + theme_bw() + theme_compact)

# ---- Top 80 ----
ps.top80 <- prune_taxa(top80, ps.alltaxa)
print(plot_bar(ps.top80, fill="Genus") + theme_bw() +
        ggtitle("Non-normalized composition of top 80 genera") +
        theme_rotated_x + legend_40_rows)
print(plot_bar(ps.top80, fill="Family") + theme_bw() +
        ggtitle("Non-normalized composition of top 80 families") +
        theme_rotated_x + legend_40_rows)

# Rescale within the top-80 subset and plot.
ps.top80 <- transform_sample_counts(ps.top80, function(OTU) OTU/sum(OTU))

print(plot_bar(ps.top80, fill="Genus") +
        ggtitle("Normalized composition of top 80 genera") +
        legend_40_rows + theme_bw() + theme_compact)
# Title fixed: this plot is coloured by Family, not Genus.
print(plot_bar(ps.top80, fill="Family") +
        ggtitle("Normalized composition of top 80 families") +
        legend_40_rows + theme_bw() + theme_compact)

# ---- All taxa, coloured by species ----
# `Species` is not referenced again in the visible part of this script.
Species <- ranked_taxa
ps.Species <- transform_sample_counts(ps, function(OTU) OTU/sum(OTU))

print(plot_bar(ps.Species, fill="Species") + theme_bw() +
        ggtitle("All species (normalized)") +
        theme_rotated_x + legend_40_rows)

# ---- Export tables and the phyloseq object ----
# Both matrices are needed below to build the summary table, so they are
# extracted unconditionally. Previously they were only created when -t / -o
# were supplied, which made the script fail without those optional flags.
tax_mat <- as(tax_table(ps), "matrix")
otu_mat <- as(otu_table(ps), "matrix")

# Optional raw exports, keyed by ASV sequence. write.table() keeps the
# row/column layout (write() would flatten the matrix); col.names = NA adds
# an empty header cell above the row-name column so the header lines up.
if (tax) {
  write.table(tax_mat, file = tax_table, sep = "\t", quote = FALSE,
              col.names = NA)
}
if (otu) {
  # Fixed: this branch used to write tax_mat to the OTU file.
  write.table(otu_mat, file = otu_table, sep = "\t", quote = FALSE,
              col.names = NA)
}

saveRDS(ps, phylo_file)

# ---- ASV sequence -> taxonomy string lookup ----
# `h` maps each ASV sequence (row name of tax_mat) to its taxonomic ranks
# joined with ";". It is reused further down when the FASTA file is written.
h <- hash()
for (asv in rownames(tax_mat)) {
  h[asv] <- paste(tax_mat[asv, ], collapse = ";")
}

# ---- Summary table ----
# Relabel the count columns with the taxonomy string of their ASV.
new_header <- vapply(colnames(otu_mat), function(asv) h[[asv]],
                     character(1), USE.NAMES = FALSE)
otu_mat_newheader <- otu_mat
colnames(otu_mat_newheader) <- new_header

# Per-sample total. apply(..., sum) keeps integer counts as integers, so they
# are not rendered in scientific notation when the matrix becomes character.
Total <- apply(otu_mat_newheader, 1, sum)

otu_mat_wsums <- cbind(Total, otu_mat_newheader)
#print(head(otu_mat_wsums))
# Adding the character Sample_ID column turns the whole matrix into character.
otu_mat_wsums <- cbind(Sample_ID = rownames(otu_mat_wsums), otu_mat_wsums)

# Replace every "." in the column names with ";" (a trailing "." is dropped,
# as strsplit() discards a trailing empty piece).
colnames(otu_mat_wsums) <- vapply(
  strsplit(colnames(otu_mat_wsums), ".", fixed = TRUE),
  paste, character(1), collapse = ";"
)

rownames(otu_mat_wsums) <- NULL

# NOTE(review): data.frame() defaults to check.names = TRUE, which passes the
# column names through make.names(); the ";"-separated taxonomy headers are
# therefore rewritten (";" and spaces become ".", duplicates get a numeric
# suffix) in the written file. write.table() also emits row numbers because
# row.names defaults to TRUE. Both are left as-is to keep the output format
# stable -- confirm whether check.names = FALSE / row.names = FALSE is wanted.
to_save <- data.frame(otu_mat_wsums)
write.table(to_save, file = summarytable, sep = "\t", quote = FALSE)

# Close the PDF device that holds all plots.
dev.off()

# ---- FASTA export and alignment ----
# One row per ASV: the sequence itself ("16S") and its taxonomy string ("Tax"),
# both read from the hash `h` (keys are ASV sequences). Built in one step
# instead of growing the data frame row by row.
asv_seqs <- keys(h)
aln_df <- data.frame(
  asv_seqs,
  vapply(asv_seqs, function(asv) as.character(h[[asv]]), character(1),
         USE.NAMES = FALSE),
  stringsAsFactors = FALSE
)
colnames(aln_df) <- c("16S", "Tax")
print(head(aln_df))

# Uniquify taxonomy IDs for later processing with iq-tree: make.unique()
# appends ".1", ".2", ... to repeated taxonomy strings.
aln_df["New_Tax"] <- make.unique(aln_df[["Tax"]])
print(aln_df[["New_Tax"]])

# Sequences as Biostrings DNAString objects, FASTA headers as a list.
# NOTE: from here on `tax` holds the FASTA headers, no longer the logical
# "-t was supplied" flag used earlier in the script.
seqs <- lapply(aln_df[["16S"]], DNAString)
tax <- as.list(aln_df[["New_Tax"]])

# Save sequences as FASTA
unaligned <- paste0(fastaprefix, "_unaligned.fa")
write.fasta(seqs, tax, unaligned, open = "w")

# Align with MUSCLE. The file names are shell-quoted so that prefixes
# containing spaces or shell metacharacters are passed through intact.
# NOTE(review): "-in"/"-out" is the option style of older MUSCLE releases;
# confirm it matches the installed version.
aligned <- paste0(fastaprefix, "_ALIGNED.fa")
quote_type <- if (.Platform$OS.type == "windows") "cmd" else "sh"
muscle_status <- system2(
  "muscle",
  args = c("-in", shQuote(unaligned, type = quote_type),
           "-out", shQuote(aligned, type = quote_type))
)
if (!identical(muscle_status, 0L)) {
  warning("muscle exited with status ", muscle_status,
          "; the aligned FASTA may be missing or incomplete.", call. = FALSE)
}