#!/usr/bin/perl -w
use strict;
#Viewing and editing this script is easiest in a programming text editor set to interpret Perl. References to line numbers will only make sense using a programming editor (e.g. http://www.activestate.com/komodo-edit).
#This script is designed to assess the 'Nuclei.csv' output file produced by Cell Profiler using instructions from pipeline file 3_channels_pipeline.cppipe. It applies 2 separate gates to the nuclear antibody data (S780) and GFP-CDK2 reporter ratio values, respectively. It then creates a new file containing a copy of the original data with abbreviated column titles while adding a new column of labels relating to how each cell observation performed against the two gates. Also included is a record of the gate values entered by the user.
#The value of using a script to perform this function is that it will work on files larger than programs such as Excel can handle. This is often the case for high-content, individual cell data.
#On running the script a questionnaire will guide the user to supply filenames for input and output (remember to include .csv extensions) and the values for the two gates.
#Column headers from the raw data are renamed for clarity in the new output file (see the header line that is printed to the output file). Like the gate-related labels, these column headers are used subsequently in R to select specific data.
#The regular expression used in the main processing loop pattern-matches individual cell data for the specific column format produced by Cell Profiler when using the 3_channels_pipeline.cppipe settings. Changes to selected output parameters will need the regular expression to be modified to accommodate the resulting changes to the organisation of the Cell Profiler Nuclei.csv output file. General notes on modifying the script are at the foot of this file.
use strict;
use warnings;

#Sub-routine used in questionnaire.
#  Argument: the prompt text to show the user.
#  Returns:  one line read from standard input, without its trailing newline.
#  Dies:     if standard input ends before an answer is supplied, so that an
#            empty answer is never silently used as a filename or gate value.
sub get_line {
    my ($prompt) = @_;
    print $prompt;
    my $line = <STDIN>;
    die "No answer received for prompt '$prompt'\n" unless defined $line;
    chomp $line;
    return $line;
}

#Define 96well plate addresses in @plate A1-H12. Addresses are generated
#column by column (A1..H1, then A2..H2, and so on).
#NOTE(review): @plate is built here but is not referenced by the rest of the
#visible script; it is retained from the original.
my (@column, @row, @plate);
@column = qw(1 2 3 4 5 6 7 8 9 10 11 12);
@row    = qw(A B C D E F G H);
for my $column (@column) {
    for my $row (@row) {
        push @plate, "$row$column";
    }
}

#Get user input via questionnaire:
#threshold concerns P-S780RB1 ab, threshold 2 concerns GFP-CDK2 reporter
my ($source, $dest, $threshold, $threshold2);

#Command line users can skip the questionnaire if they supply all 4 answers
#(source file, destination file, S780 threshold, GFP-CDK2 reporter threshold)
#at the same time as running the script.
my $answers_on_command_line = (@ARGV == 4);

#IN and OUT are deliberately kept as bareword filehandles because the
#processing loop later in the file reads from IN and prints to OUT by name.
#The three-argument form of open is used so that characters in a filename
#(e.g. a leading '>' or a trailing '|') can never be interpreted as an open mode.
$source = $answers_on_command_line ? $ARGV[0] : get_line("Which source file? ");
open(IN, '<', $source)
    or die "Can't open '$source' for input: $!";

$dest = $answers_on_command_line ? $ARGV[1] : get_line("Which destination file? ");
die "Won't overwrite existing file" if -e $dest;
#Append mode is requested, but as an existing file is refused on the line
#above this creates a new destination file.
open(OUT, '>>', $dest)
    or die "Can't open '$dest' for output: $!";

$threshold  = $answers_on_command_line ? $ARGV[2] : get_line("What S780 threshold? ");
$threshold2 = $answers_on_command_line ? $ARGV[3] : get_line("What GFP-CDK2 reporter threshold? ");

#Header line for output. The last two column titles record the gate values
#that were entered by the user.
print OUT "Well,Cell no,Frame no,Nuc Area,IntegBlue,MeanGFP,MeanRed,GFP_Ratio,Label,S780>$threshold,GFP-CDK2Rep>$threshold2\n";

my ($well);    #declared by the original script; not referenced by the visible code

# The loop that follows reads each line of data (i.e. each single cell data profile), uses the 2 gates to assign one of four labels to the current cell entry and writes a correspondingly modified line of data (still in comma-separated values) for each in the new OUTput file.
# NB regular expression lines use (.*) for antibody intensity values in order to NOT exclude very low values output by Cell Profiler in scientific notation.
#Apply the two gates to one line of Cell Profiler 'Nuclei.csv' data.
#  Arguments: the raw input line (including its line ending), the S780
#             antibody gate and the GFP-CDK2 reporter gate.
#  Returns:   the re-ordered, labelled output line (newline terminated), or
#             nothing (undef in scalar context) when the line does not fit the
#             expected column layout, e.g. the header line of the input file.
#The regular expression pattern-matches the column format produced by the
#3_channels_pipeline.cppipe settings. It uses (.*) for the antibody intensity
#so that very low values written in scientific notation are NOT excluded.
sub label_cell_line {
    my ($line, $s780_gate, $reporter_gate) = @_;

    #The captures are $1..$8 of the pattern, in order. Lines that do not
    #match yield an empty list and are skipped.
    my ($cell_no, $frame_no, $well_id, $nuc_area,
        $integ_blue, $mean_gfp, $mean_red, $gfp_ratio)
        = $line =~ /^\d{1,4},(\d{1,4}),Red,Blue,Green,(\d{1,2}),([A-H][0-9]+),[0-9]+,[A-H],(\d{1,4}),(\d*.\d*),(\d*.\d*),(.*),(\d*.\d*)\r*\n/
        or return;

    #gates applied here can use gt > lt < or equal to == symbols
    my $s780_label  = $mean_red  >= $s780_gate     ? 'P-S780+' : 'P-S780-';
    my $phase_label = $gfp_ratio >= $reporter_gate ? 'G1'      : 'nonG1';

    #Column order matches the header line written to the output file:
    #Well, Cell no, Frame no, Nuc Area, IntegBlue, MeanGFP, MeanRed, GFP_Ratio, Label
    return join(',',
        $well_id, $cell_no, $frame_no, $nuc_area,
        $integ_blue, $mean_gfp, $mean_red, $gfp_ratio,
        "$s780_label $phase_label") . "\n";
}

#Read the input one line at a time (so that very large files never have to be
#held in memory), label every line that holds single cell data and write it to
#the output file. $threshold and $threshold2 are the two gate values obtained
#earlier in the script; IN and OUT are the filehandles opened there.
while (my $line = <IN>) {
    my $labelled = label_cell_line($line, $threshold, $threshold2);
    print OUT $labelled if defined $labelled;
}

close IN;
#Buffered write errors (e.g. a full disk) only surface when the file is closed.
close OUT or warn "Problem closing the output file: $!";

#All Perl scripts supplied with this manuscript use 'regular expression' pattern matching lines such as the one in label_cell_line above.
#A google search for 'regex' will yield more general information about editing these lines. Below is a brief summary of the use of regex in the scripts provided with this manuscript.
#These regex lines are symbolic templates telling the script to ignore lines of data unless they fit the supplied pattern. When a given line matches, the script then uses the data to apply gates, apply appropriate labels and includes selected data from the line in a newly created file.
#If the ExportToSpreadsheet module settings in Cell Profiler are changed, the regular expression lines need to be changed to match the resulting shift in position/number of the data columns.
#regex example: /^\d{1,4},(\d{1,4}),Red,Blue,Green,(\d{1,2}),([A-H][0-9]+),[0-9]+,[A-H],(\d{1,4}),(\d*.\d*),(\d*.\d*),(.*),(\d*.\d*)\r*\n/
#Between the two slash / / symbols are comma-separated value definitions which should pattern-match the row-by-row cell data output from Cell Profiler when using 3_channels_pipeline.cppipe. The pattern used here is intended for the file Nuclei.csv.
#Between the terminal slashes, the beginning of each line is represented by ^ and the end of the line by the symbols \r*\n
#Each column of data is separated by comma symbols.
#Commas and text are matched as they appear, but "\d" specifies 'a decimal digit', "." specifies any single character except a newline, values between {} give repetition counts, values between [] define sets or ranges of characters, and + and * are quantifiers for "1-or-more" or "0-or-more", respectively. Combinations of these symbols allow for integers \d* and floating point values \d*.\d* for example (note that because the "." is not escaped it will match any character, not only a decimal point).
#The end of the regex line "\r*\n" indicates the position of invisible carriage return and line-feed symbols appropriate to both Mac and PC .csv and .txt formats.
#Parentheses () indicate those data used within the perl script for calculations (see below). Each time a line matches the regex pattern, the values wrapped in () will be assigned a '$' number, starting from $1 on the left and incrementally increasing through $2, $3, etc. These temporary memory values are then used for calculations and the building of the next line of data to be appended to the growing output file. These values reset each time the pattern matching loop reads a new line from the input data file. Values not between () symbols are effectively discarded; however, the original file is not modified by this program.
#Wherever the regex appears in this script it is identical; the values it captures in positions 7 and 8 ($7 and $8, respectively) are tested against the four logical ways of applying the two gate values ($threshold and $threshold2). The other two Perl scripts 'antibody_fluorescence_summary.pl' and 'G1assay_summary.pl' also use a regex line, but as they only use one gate, they only need a single test - the same general notes apply to those scripts.
#Repositioning the '()' symbols on the regex line, and/or changing the '$' references will allow different measurements to be used by the script.
#As an example this script, as published, will use the 'mean nuclear GFP intensity' rather than the ratio of nuc/cyt if $8 is replaced with $6 in the gate comparison.
#This would be useful if a second, nuclear-limited, antibody intensity were being multiplexed with the P-S780 RB1 ab ($7) for instance.