#!/usr/bin/perl -w use strict; #This perl script serves as an example script to remove reads from fastq files that have matches in a contaminant database, based on filtered ssaha2 output. #USAGE: perl ssaha_match_removal_script.pl my_fastq.fq my_ssaha2_contaminant_screen_results.txt my_filtered_fastq_output.fq my $fastq = shift @ARGV or die; my $ssaha2_result = shift @ARGV or die; my $out = shift @ARGV or die; #create a hash with the sequence IDs to be removed (significant matches from ssaha2 search against contaminant database) my %remove = (); open ( my $RM, $ssaha2_result ) or die "can't open $ssaha2_result\n"; while ( my $rline = <$RM> ){ chomp $rline; my @rarray = split /\s+/, $rline; $remove{$rarray[2]} = 1; } #remove sequences with contaminant matches from fastq file open ( my $FQ, $fastq ) || die "can't open $fastq\n"; open ( my $OUT, ">$out" ) || die "can't open $out\n"; my $seq_counter = 0; my $counter = 0; my $read; my $end; my $nuc_seq; my $qual_seq; while ( my $line = <$FQ> ){ chomp $line; $counter++; #this regular expression is too specific to our libraries to be made generally available if ( $line =~ m/^@(SN\S+)\s(\d\S+)/ ){ $read = $1; $end = " ".$2; #example sequence ID from read pair #1: @SN1163:57:D0RB4ACXX:5:1101:1320:2072 1:N:0:ATGTCAG #example sequence ID from read pair #2: @SN1163:57:D0RB4ACXX:5:1101:1320:2072 2:N:0:ATGTCAG $seq_counter++; } elsif ( ($counter - 2)/4 == $seq_counter -1 ){ $nuc_seq = $line; } elsif ( ($counter - 3)/4 == $seq_counter -1 ){ next; } elsif ( ($counter - 4)/4 == $seq_counter -1) { $qual_seq = $line; } else { warn "line $line doesn't conform to hash specifications (counter is at $counter, seq counter at $seq_counter)\n"; } #start processing reads individually my $def = "@"; if ( ($counter - 4)/4 == $seq_counter -1){ unless ( defined $remove{$read} ){ print $OUT "$def$read$end\n$nuc_seq\n+\n$qual_seq\n"; } } } my $check = $counter/$seq_counter; print "number of sequences read: $seq_counter; if this number isn't 4, something weird is going on with fastq file: $check\n";