## This is a python script that takes fastq forward, reverse and barcode files and analyses them 
## using qiime2. 
## The final results are Greenegenes classified taxonomic table, beta diversity distance and PCoA 

import os

## Those are the functions created for the anaylsis.

# creates and Greenegens classifier specifically fitted to our data - 
# the 515F/806R primmer set and 175bp long reads.
# the Greenegene version is 8_13 and identity is 99
def GG_classifier_train(classifier):
	# load dowloaded GG_13_8 data
	os.system('qiime tools import \
  --type \'FeatureData[Sequence]\' \
  --input-path 99_otus.fasta \
  --output-path gg_otus.qza')

	os.system('qiime tools import \
  --type \'FeatureData[Taxonomy]\' \
  --source-format HeaderlessTSVTaxonomyFormat \
  --input-path 99_otu_taxonomy.txt \
  --output-path gg_ref-taxonomy.qza')

	# fitting the classifier to our specific data - 
	# Cutting to read length and fitting primers. 
	# The primers used in the qiime2 moving pictures tutorial are 515F/806R, the same as we used here.
	os.system('qiime feature-classifier extract-reads \
  --i-sequences gg_otus.qza \
  --p-f-primer GTGCCAGCMGCCGCGGTAA \
  --p-r-primer GGACTACHVGGGTWTCTAAT \
  --p-trunc-len 175 \
  --o-reads gg_ref-seqs.qza')
	
	# training the classifier
	os.system('qiime feature-classifier fit-classifier-naive-bayes \
  --i-reference-reads gg_ref-seqs.qza \
  --i-reference-taxonomy gg_ref-taxonomy.qza \
  --o-classifier ' + classifier)

# import the fastq files into a qza file qiime2 can work with
# the forward.fastq.gz, reverse.fastq.gz and barcodes.fastq.gz files need to be in the 
# input_path folder (qiime2 needs the files to have those specific names)
def import_data_to_qiime2(input_path, out_path, verbose = False):
	os.system('qiime tools import \
  --type EMPPairedEndSequences folder \
  --input-path ' + input_path + ' \
  --output-path ' + out_path + 'emp-paired-end-sequences.qza')
	if verbose:
		print('import complete')
	return

# demultiplex and pair the paired-end reads
def demultiplex(out_path, map_file, verbose = False ):
	# demultiplex
	os.system('qiime demux emp-paired \
  --i-seqs ' + out_path + 'emp-paired-end-sequences.qza \
  --m-barcodes-file ' + map_file + ' \
  --p-rev-comp-barcodes \
  --m-barcodes-category BarcodeSequence \
  --o-per-sample-sequences ' + out_path + 'demux.qza')
	
	# creates a .qzv visualization file of the results.
	# the visualizations can be viewed using the qiime2 
	# "qiime tools view" command
	os.system('qiime demux summarize \
  --i-data ' + out_path + 'demux.qza \
  --o-visualization ' + out_path + 'demux.qzv')	  
	# os.system('qiime tools view ' + out_path + 'demux.qzv')
	if verbose:
		print('demultiplex complete')
	return

# runs the demultiplexed data through DADA2.
# This performs quality control, truncates reads and removes bad
# quality reads and chimeras. Creates a feature table with clean SVs
# and a list of representative sequences (for all SVs that passed quality control)
def DADA2(out_path, map_file, trim_left = 10, trunc_len = 160, verbose = False):
	# Runs DADA2 - quality control + creates SVs table and list
	os.system('qiime dada2 denoise-paired \
  --i-demultiplexed-seqs ' + out_path  + 'demux.qza \
  --p-trim-left-f ' + str(trim_left) + ' \
  --p-trunc-len-f ' + str(trunc_len) + ' \
  --p-trim-left-r ' + str(trim_left) + ' \
  --p-trunc-len-r ' + str(trunc_len) + ' \
  --o-representative-sequences ' + out_path + 'rep-seqs.qza \
  --o-table ' + out_path + 'table.qza')
	
	# creates a .qzv visualization files of the results.
	# the visualizations can be viewed using the qiime2 
	# "qiime tools view" command# visualize
	os.system('qiime feature-table summarize \
  --i-table ' + out_path + 'table.qza \
  --o-visualization ' + out_path + 'table.qzv \
  --m-sample-metadata-file ' + map_file)
	# os.system('qiime tools view ' + out_path + '/downstream_files/table.qzv')

	os.system('qiime feature-table tabulate-seqs \
  --i-data ' + out_path + 'rep-seqs.qza \
  --o-visualization ' + out_path + 'rep-seqs.qzv')
	
	if verbose:
		print('DADA2 complete')
	return

# Performs a phylogenetic analysis of the SVs. The resulting tree is used to calculate 
# core-metrics of various alpha and beta diversity measures that will be later used
# each sample is rarefied to rar_num reads at this stage to avoid sample size bias
def phylogenetic_analysis(out_path, rar_num = 2000, verbose = False):
	# phylogenetic diversity analysis
	os.system('qiime alignment mafft \
  --i-sequences ' + out_path + 'rep-seqs.qza \
  --o-alignment ' + out_path + 'aligned-rep-seqs.qza')
	
	os.system('qiime alignment mask \
  --i-alignment ' + out_path + 'aligned-rep-seqs.qza \
  --o-masked-alignment ' + out_path + 'masked-aligned-rep-seqs.qza')
	
	os.system('qiime phylogeny fasttree \
  --i-alignment ' + out_path + 'masked-aligned-rep-seqs.qza \
  --o-tree ' + out_path + 'unrooted-tree.qza')
	
	os.system('qiime phylogeny midpoint-root \
  --i-tree ' + out_path + 'unrooted-tree.qza \
  --o-rooted-tree ' + out_path + 'rooted-tree.qza')
	
	# calculates core-metrics of various alpha and beta diversity measures
	# using the calculated rooted tree
	os.system('qiime diversity core-metrics \
  --i-phylogeny ' + out_path + 'rooted-tree.qza \
  --i-table ' + out_path + 'table.qza \
  --p-sampling-depth ' + str(rar_num) + ' \
  --output-dir ' + out_path + 'core-metrics-results')
	
	if verbose:
		print('phylogenetic analysis complete')
	return


# makes beta diversity based PCoA plot, here uses unweighted_unifrac
def beta_div( out_path, map_file, verbose = False):
	os.system('qiime emperor plot \
  --i-pcoa ' + out_path + 'core-metrics-results/unweighted_unifrac_pcoa_results.qza \
  --m-metadata-file ' + map_file + ' \
  --o-visualization ' + out_path + 'core-metrics-results/unweighted-unifrac-emperor.qzv')

	if verbose:
		print('beta diversity complete')
	return
	
# classifies SVs to taxonomy using the pre-trained classifier. makes taxonomy tables at all levels 
# and taxonomic interactive barplots (taxa-bar-plots.qzv)
def taxa_analysis(path, map_file, verbose = False, classifier = 'gg_classifier.qza'):
	# classifies feathures (SVs) to taxa using the pre-trained classifier
	os.system('qiime feature-classifier classify-sklearn \
  --i-classifier ' + classifier + ' \
  --i-reads ' + path + 'rep-seqs.qza \
  --o-classification ' + path + 'taxonomy.qza')
	
	# creating a visualization of the resulting table (.qzv file)
	os.system('qiime metadata tabulate \
  --m-input-file ' + path + 'taxonomy.qza \
  --o-visualization ' + path + 'taxonomy.qzv')
	
	# making an interactive taxonomic barplot
	os.system('qiime taxa barplot \
  --i-table ' + path + 'table.qza \
  --i-taxonomy ' + path + 'taxonomy.qza \
  --m-metadata-file ' + map_file + ' \
  --o-visualization ' + path + 'taxa-bar-plots.qzv')
	if verbose:
		print('taxa analysis complete')
	return
	
# Does the upstream analysis from fastq files to DADA2 output
def qiime2_fastq_to_DADAtable(input_path, out_path, map_file, trim_left = 10, trunc_len = 160, verbose = True ):
	import_data_to_qiime2(input_path, out_path, verbose )
	demultiplex(out_path, map_file, verbose )
	DADA2(out_path, map_file, trim_left, trunc_len, verbose)

# Does the downstream analysis from DADA2 output to beta diversity and taxonomic classification results.
def qiime2_table_to_beta_taxa(out_path, map_file, rar_num, verbose = True, classifier = 'gg_classifier.qza'):
	phylogenetic_analysis(out_path, rar_num, verbose)
	beta_div( out_path, map_file, verbose)
	taxa_analysis(out_path, map_file, verbose, classifier)

	
## This code execute the above functions and performs the specific analysis
	
# train the classifier
classifier = 'gg_classifier.qza'
GG_classifier_train(classifier)

# set parameters
rar_num = '2146' # number of reads at rarefaction
trim_left = 13 # number of nucleotides to trim from left during QC
trunc_len = 160 # Where to trunc the read during QC
verbose = True # True for a textual verification of progress

# set and run the first run fastq files, up to creating DADA2 features table etc.
input_path_db1 = 'data/db1/emp-paired-end-sequences'
out_path_db1 = 'results/db1/'
map_file = 'map_DB1.txt'
# run data 1
os.system('mkdir ' + out_path_db1)
qiime2_fastq_to_DADAtable(input_path_db1, out_path_db1, map_file, trim_left, trunc_len )

# set and run the second run fastq files, up to creating DADA2 features table etc.
input_path_db2 = 'data/db2/emp-paired-end-sequences'
out_path_db2 = 'results/db2/'
map_file = 'map_db2.txt'
# run data 2
os.system('mkdir ' + out_path_db2)
qiime2_fastq_to_DADAtable(input_path_db2, out_path_db2, map_file, trim_left, trunc_len )


# merge DADA2 results from the two separate runs
out_path = 'results/merge/'
map_file = 'map.txt'
os.system('mkdir ' + out_path)

os.system('qiime feature-table merge \
--i-table1 ' + out_path_db1 + 'table.qza \
--i-table2 ' + out_path_db2 + 'table.qza \
--o-merged-table ' + out_path + 'table.qza')

os.system('qiime feature-table summarize \
  --i-table ' + out_path + 'table.qza \
  --o-visualization ' + out_path + 'table.qzv \
  --m-sample-metadata-file ' + map_file)

os.system('qiime feature-table merge-seq-data \
--i-data1 ' + out_path_db1 + 'rep-seqs.qza \
--i-data2 ' + out_path_db2 + 'rep-seqs.qza \
--o-merged-data ' + out_path + 'rep-seqs.qza')

os.system('qiime feature-table tabulate-seqs \
  --i-data ' + out_path + 'rep-seqs.qza \
  --o-visualization ' + out_path + 'rep-seqs.qzv')

# analyze the merged data
qiime2_table_to_beta_taxa(out_path, map_file, rar_num, verbose, classifier)