## This is a python script that takes fastq forward, reverse and barcode files and analyses them ## using qiime2. ## The final results are Greenegenes classified taxonomic table, beta diversity distance and PCoA import os ## Those are the functions created for the anaylsis. # creates and Greenegens classifier specifically fitted to our data - # the 515F/806R primmer set and 175bp long reads. # the Greenegene version is 8_13 and identity is 99 def GG_classifier_train(classifier): # load dowloaded GG_13_8 data os.system('qiime tools import \ --type \'FeatureData[Sequence]\' \ --input-path 99_otus.fasta \ --output-path gg_otus.qza') os.system('qiime tools import \ --type \'FeatureData[Taxonomy]\' \ --source-format HeaderlessTSVTaxonomyFormat \ --input-path 99_otu_taxonomy.txt \ --output-path gg_ref-taxonomy.qza') # fitting the classifier to our specific data - # Cutting to read length and fitting primers. # The primers used in the qiime2 moving pictures tutorial are 515F/806R, the same as we used here. os.system('qiime feature-classifier extract-reads \ --i-sequences gg_otus.qza \ --p-f-primer GTGCCAGCMGCCGCGGTAA \ --p-r-primer GGACTACHVGGGTWTCTAAT \ --p-trunc-len 175 \ --o-reads gg_ref-seqs.qza') # training the classifier os.system('qiime feature-classifier fit-classifier-naive-bayes \ --i-reference-reads gg_ref-seqs.qza \ --i-reference-taxonomy gg_ref-taxonomy.qza \ --o-classifier ' + classifier) # import the fastq files into a qza file qiime2 can work with # the forward.fastq.gz, reverse.fastq.gz and barcodes.fastq.gz files need to be in the # input_path folder (qiime2 needs the files to have those specific names) def import_data_to_qiime2(input_path, out_path, verbose = False): os.system('qiime tools import \ --type EMPPairedEndSequences folder \ --input-path ' + input_path + ' \ --output-path ' + out_path + 'emp-paired-end-sequences.qza') if verbose: print('import complete') return # demultiplex and pair the paired-end reads def demultiplex(out_path, map_file, verbose = False ): # demultiplex os.system('qiime demux emp-paired \ --i-seqs ' + out_path + 'emp-paired-end-sequences.qza \ --m-barcodes-file ' + map_file + ' \ --p-rev-comp-barcodes \ --m-barcodes-category BarcodeSequence \ --o-per-sample-sequences ' + out_path + 'demux.qza') # creates a .qzv visualization file of the results. # the visualizations can be viewed using the qiime2 # "qiime tools view" command os.system('qiime demux summarize \ --i-data ' + out_path + 'demux.qza \ --o-visualization ' + out_path + 'demux.qzv') # os.system('qiime tools view ' + out_path + 'demux.qzv') if verbose: print('demultiplex complete') return # runs the demultiplexed data through DADA2. # This performs quality control, truncates reads and removes bad # quality reads and chimeras. Creates a feature table with clean SVs # and a list of representative sequences (for all SVs that passed quality control) def DADA2(out_path, map_file, trim_left = 10, trunc_len = 160, verbose = False): # Runs DADA2 - quality control + creates SVs table and list os.system('qiime dada2 denoise-paired \ --i-demultiplexed-seqs ' + out_path + 'demux.qza \ --p-trim-left-f ' + str(trim_left) + ' \ --p-trunc-len-f ' + str(trunc_len) + ' \ --p-trim-left-r ' + str(trim_left) + ' \ --p-trunc-len-r ' + str(trunc_len) + ' \ --o-representative-sequences ' + out_path + 'rep-seqs.qza \ --o-table ' + out_path + 'table.qza') # creates a .qzv visualization files of the results. # the visualizations can be viewed using the qiime2 # "qiime tools view" command# visualize os.system('qiime feature-table summarize \ --i-table ' + out_path + 'table.qza \ --o-visualization ' + out_path + 'table.qzv \ --m-sample-metadata-file ' + map_file) # os.system('qiime tools view ' + out_path + '/downstream_files/table.qzv') os.system('qiime feature-table tabulate-seqs \ --i-data ' + out_path + 'rep-seqs.qza \ --o-visualization ' + out_path + 'rep-seqs.qzv') if verbose: print('DADA2 complete') return # Performs a phylogenetic analysis of the SVs. The resulting tree is used to calculate # core-metrics of various alpha and beta diversity measures that will be later used # each sample is rarefied to rar_num reads at this stage to avoid sample size bias def phylogenetic_analysis(out_path, rar_num = 2000, verbose = False): # phylogenetic diversity analysis os.system('qiime alignment mafft \ --i-sequences ' + out_path + 'rep-seqs.qza \ --o-alignment ' + out_path + 'aligned-rep-seqs.qza') os.system('qiime alignment mask \ --i-alignment ' + out_path + 'aligned-rep-seqs.qza \ --o-masked-alignment ' + out_path + 'masked-aligned-rep-seqs.qza') os.system('qiime phylogeny fasttree \ --i-alignment ' + out_path + 'masked-aligned-rep-seqs.qza \ --o-tree ' + out_path + 'unrooted-tree.qza') os.system('qiime phylogeny midpoint-root \ --i-tree ' + out_path + 'unrooted-tree.qza \ --o-rooted-tree ' + out_path + 'rooted-tree.qza') # calculates core-metrics of various alpha and beta diversity measures # using the calculated rooted tree os.system('qiime diversity core-metrics \ --i-phylogeny ' + out_path + 'rooted-tree.qza \ --i-table ' + out_path + 'table.qza \ --p-sampling-depth ' + str(rar_num) + ' \ --output-dir ' + out_path + 'core-metrics-results') if verbose: print('phylogenetic analysis complete') return # makes beta diversity based PCoA plot, here uses unweighted_unifrac def beta_div( out_path, map_file, verbose = False): os.system('qiime emperor plot \ --i-pcoa ' + out_path + 'core-metrics-results/unweighted_unifrac_pcoa_results.qza \ --m-metadata-file ' + map_file + ' \ --o-visualization ' + out_path + 'core-metrics-results/unweighted-unifrac-emperor.qzv') if verbose: print('beta diversity complete') return # classifies SVs to taxonomy using the pre-trained classifier. makes taxonomy tables at all levels # and taxonomic interactive barplots (taxa-bar-plots.qzv) def taxa_analysis(path, map_file, verbose = False, classifier = 'gg_classifier.qza'): # classifies feathures (SVs) to taxa using the pre-trained classifier os.system('qiime feature-classifier classify-sklearn \ --i-classifier ' + classifier + ' \ --i-reads ' + path + 'rep-seqs.qza \ --o-classification ' + path + 'taxonomy.qza') # creating a visualization of the resulting table (.qzv file) os.system('qiime metadata tabulate \ --m-input-file ' + path + 'taxonomy.qza \ --o-visualization ' + path + 'taxonomy.qzv') # making an interactive taxonomic barplot os.system('qiime taxa barplot \ --i-table ' + path + 'table.qza \ --i-taxonomy ' + path + 'taxonomy.qza \ --m-metadata-file ' + map_file + ' \ --o-visualization ' + path + 'taxa-bar-plots.qzv') if verbose: print('taxa analysis complete') return # Does the upstream analysis from fastq files to DADA2 output def qiime2_fastq_to_DADAtable(input_path, out_path, map_file, trim_left = 10, trunc_len = 160, verbose = True ): import_data_to_qiime2(input_path, out_path, verbose ) demultiplex(out_path, map_file, verbose ) DADA2(out_path, map_file, trim_left, trunc_len, verbose) # Does the downstream analysis from DADA2 output to beta diversity and taxonomic classification results. def qiime2_table_to_beta_taxa(out_path, map_file, rar_num, verbose = True, classifier = 'gg_classifier.qza'): phylogenetic_analysis(out_path, rar_num, verbose) beta_div( out_path, map_file, verbose) taxa_analysis(out_path, map_file, verbose, classifier) ## This code execute the above functions and performs the specific analysis # train the classifier classifier = 'gg_classifier.qza' GG_classifier_train(classifier) # set parameters rar_num = '2146' # number of reads at rarefaction trim_left = 13 # number of nucleotides to trim from left during QC trunc_len = 160 # Where to trunc the read during QC verbose = True # True for a textual verification of progress # set and run the first run fastq files, up to creating DADA2 features table etc. input_path_db1 = 'data/db1/emp-paired-end-sequences' out_path_db1 = 'results/db1/' map_file = 'map_DB1.txt' # run data 1 os.system('mkdir ' + out_path_db1) qiime2_fastq_to_DADAtable(input_path_db1, out_path_db1, map_file, trim_left, trunc_len ) # set and run the second run fastq files, up to creating DADA2 features table etc. input_path_db2 = 'data/db2/emp-paired-end-sequences' out_path_db2 = 'results/db2/' map_file = 'map_db2.txt' # run data 2 os.system('mkdir ' + out_path_db2) qiime2_fastq_to_DADAtable(input_path_db2, out_path_db2, map_file, trim_left, trunc_len ) # merge DADA2 results from the two separate runs out_path = 'results/merge/' map_file = 'map.txt' os.system('mkdir ' + out_path) os.system('qiime feature-table merge \ --i-table1 ' + out_path_db1 + 'table.qza \ --i-table2 ' + out_path_db2 + 'table.qza \ --o-merged-table ' + out_path + 'table.qza') os.system('qiime feature-table summarize \ --i-table ' + out_path + 'table.qza \ --o-visualization ' + out_path + 'table.qzv \ --m-sample-metadata-file ' + map_file) os.system('qiime feature-table merge-seq-data \ --i-data1 ' + out_path_db1 + 'rep-seqs.qza \ --i-data2 ' + out_path_db2 + 'rep-seqs.qza \ --o-merged-data ' + out_path + 'rep-seqs.qza') os.system('qiime feature-table tabulate-seqs \ --i-data ' + out_path + 'rep-seqs.qza \ --o-visualization ' + out_path + 'rep-seqs.qzv') # analyze the merged data qiime2_table_to_beta_taxa(out_path, map_file, rar_num, verbose, classifier)