#SUPPLEMENTAL FILE: COMMAND LINE SUGGESTIONS TO PERFORM DE NOVO ASSEMBLY, ANNOTATION AND DIFFERENTIAL EXPRESSION ANALYSIS OF ILLUMINA RNA-SEQ DATA #For these example command lines, we assume that you have unzipped, paired fastq files from an Illumina sequencer. We only provide command lines for 1 library - note that for a regular experiment, you will have at least 6 (2 conditions with 3 replicates per experiment). We assume that all of the programs listed in Table 1 are downloaded and in your path. Here, we only detail steps that need to be accomplished via command line (including R commands); other intervening steps may be necessary (e.g. FastQC checks of read quality before and after read cleaning; generating a transcript-to-gene-map for RSEM). #All comments in this document are preceded by a # sign, in order to distinguish them from commands. #If libraries were split across two Illumina lanes, identify matching fastq files and concatenate them, e.g.: cat fastq_split_a_pair1.fq fastq_split_b_pair1.fq > fastq_pair1.fq cat fastq_split_a_pair2.fq fastq_split_b_pair2.fq > fastq_pair2.fq #ssaha2. Perform these commands for all libraries. ssaha2Build -rtype solexa -save my_basename my_reference.fa ssaha2 -rtype solexa -identity 95 -best 1 -save my_basename fastq_pair1.fq > fastq1_v_ref_ssaha2.out grep ALIGNMENT fastq1_v_ref_ssaha2.out > fastq1_v_ref_ssaha2.out.short #We recommend removing both read pairs from a contaminant match, even if only one read in the pair matched the contaminant. To do this step, combine the ssaha2 output from the 1st and 2nd reads in the pair from the same library before the read removal step: cat fastq1_v_ref_ssaha2.out.short fastq2_v_ref_ssaha2.out.short > fastq_v_ref_ssaha2.out.short #We provide a representative perl script for removing reads from a fastq file with matches in the ssaha2 filtered output (Supplemental File 2). This script serves as an example, and may not work with your datafiles - in particular due to the regular expression on l. 33 - but may be modified to suit your own needs. perl ssaha_match_removal_script.pl test_reads1.fq fastq_v_ref_ssaha2.out.short test_reads1_filtered.fq #SolexaQA: DynamicTrim and LengthSort. If you specify both read pair fastq files in the LengthSort.pl script, the script will sort reads into 'paired' and 'single' reads - this is important for downstream steps that require that reads in each fastq file are exactly paired between files. perl DynamicTrim.pl fastq_pair1_filtered.fq fastq_pair2_filtered.fq -h 20 perl LengthSort.pl fastq_pair1_filtered.fq.trimmed fastq_pair2_filtered.fq.trimmed #Digital normalization via khmer. #For Illumina fastq format 1.8, it is necessary to run the interleave.py script provided in the 'sandbox' directory: python interleave.py fastq_pair1_filtered.fq.trimmed.paired1 fastq_pair2_filtered.fq.trimmed.paired2 > fastq_filtered.fq.trimmed.paired.interleaved #Note - the command below should use 16Gb of RAM. See http://khmer.readthedocs.org/en/latest/choosing-table-sizes.html for an explanation of how to vary the -N and -x parameters. -k determines the kmer size, and -C the coverage cutoff. The command will discard sequences if their median k-mer abundance lies above the specified coverage cutoff. normalize-by-median.py -p -k 20 -C 20 -N 4 -x 1e9 fastq_filtered.fq.trimmed.paired.interleaved #Trinity de novo assembly of the normalized, interleaved read pairs (from all libraries): Trinity.pl --seqType fq --min_kmer_cov 2 --JM 256G --run_as_paired --CPU 24 --single fastq_filtered.fq.trimmed.paired.interleaved.keep #Note - the parameter --min_kmer_cov 2 will reduce memory usage, but will discard kmers with a coverage of less than 2. The --CPU parameter designates the number of CPUs used in the program - if you have more CPUs available, you can increase this parameter. #Assess the quality of the assembly: perl assemblathon_stats.pl trinity_out.fa #BLASTX annotation of de novo transcriptome assembly. Here, we use Drosophila melanogaster proteins as our reference. Note that '-num_threads 12' specifies that 12 cpus will be used - modify this parameter if necessary. The flag '-outfmt 6' generates a tab-formatted output with the following columns: 'qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore' (see Blastx documentation for more information). The e-value cutoff of 1e-3 can be reduced if closely related taxa are being aligned (e.g. 1e-6 if comparing A. albopictus with A. aegypti). makeblastdb -parse_seqids -dbtype prot -in dmel-all-translation-r5.54.fasta blastx -query trinity_out.fa -db dmel-all-translation-r5.54.fasta -evalue 1e-3 -num_alignments 1 -num_threads 12 -outfmt 6 -out my_blastx_output.txt #Create a reference dataset from the transcriptome fasta file rsem-prepare-reference --transcript-to-gene-map my_transcript_to_gene_map.txt --no-polyA trinity_out.fa my_trinity_assembly #Calculate expression. Note that '--num_threads 12' specifies that 12 cpus will be used - modify this parameter if necessary. rsem-calculate-expression --paired-end --num-threads 12 fastq_pair1_filtered.fq.trimmed.paired1 fastq_pair2_filtered.fq.trimmed.paired2 my_trinity_assembly my_rsem_analysis_lib1 #Make matrix from expression results from all libraries. perl rsem-generate-data-matrix my_rsem_analysis_lib1.genes.results my_rsem_analysis_lib2.genes.results > my_rsem_output.txt #EdgeR commands to generate differential expression data (these should be performed in the R environment): #Note: we highly recommend reading the User's guide before analyzing your data. EdgeR has many options and parameters which are explained in detail here: #http://www.bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf. #process data data <- read.delim("my_rsem_output.txt", row.names=1) data <- round(data) expr <- rowSums(cpm(data)>1) >= 3 data_expr <- data[expr,]) #Generate a DGEList object (e.g., for a data matrix with three SD and three LD replicates: group <- c("SD", "SD", "SD", "LD", "LD", "LD") design <- model.matrix(~0+group) dgelist <- DGEList(counts=data_expr, group=group) #Perform TMM normalization dgelist <- calcNormFactors(dgelist) #Estimate the common and tagwise dispersions of the data dgelist <- estimateCommonDisp(dgelist) dgelist <- estimateTagwiseDisp(dgelist) #Get a list of genes with a Benjamini-Hochberg corrected p-value < 0.05 et <- exactTest(dgelist) all <- topTags(et, n = dim(dgelist $counts)[1]) de <- subset(all$table, all$table$FDR<0.05)) #Plot the distribution of log-fold-change vs. abundance detags <- rownames(de) plotSmear(et, de.tags=detags)