#!/usr/bin/perl use strict; use warnings; # Notice - this is a custom Perl script as described in the 2016 Nature Genetics paper # "A high-quality carrot genome assembly provides new insights into carotenoid # accumulation and Asterid genome evolution" # This program was referenced by this line: # ... "Top 1% of FST values were determined and visualized by a custom Perl script." ... # Please be aware that this is not a general purpose program, it is highly specific to # our analysis, and is provided for informative purposes only. Note that this program # is a helper program for 34.fstplots.combined.bis.sh # change x coordinates to merge all 9 chromosomes # configuration variables my ( $pfx ) = $0 =~ m/\/(\d+)[^\/]*$/; my $infilename = $ARGV[0]; my $outfilename = $ARGV[1]; my $binsize = $ARGV[2]; my $remark = $ARGV[3]; #my @chrlen = (0 51478430 43922073 50330249 35931953 41970082 36619998 36387505 31755805 33754894); my %chrstart = ( 'CHR1' => 0, 'CHR2' => 51478430, 'CHR3' => 95400503, 'CHR4' => 145730752, 'CHR5' => 181662705, 'CHR6' => 223632787, 'CHR7' => 260252785, 'CHR8' => 296640290, 'CHR9' => 328396095, 'end' => 362150989 ); my $printntopbins = 12; # global variables my @binsa; # just a count my @binsb; # sum of values my $nlines = 0; my $nsaved = 0; my $INF = stdopen ( "<", $infilename ); while ( my $aline = <$INF> ) { $nlines++; next unless ( $nlines > 1 ); $aline =~ s/[\r\n]//g; my @cols = split ( /\t/, $aline ); # stop when we hit PT next unless ( defined $chrstart{$cols[0]} ); my $x = $cols[1] + $chrstart{$cols[0]}; unless ( $cols[2] =~ m/nan/ ) { my $bin = int($x / $binsize); $binsa[$bin]++; $binsb[$bin] += $cols[2]; $nsaved++; } } # while <$INF> close ( $INF ); print commify($nlines), " lines processed, ".commify($nsaved)." lines saved ".timestr()."\n"; my $ymaxa = 0; my $ymaxb = 0; my %reverseb; my $OUTF = stdopen ( ">", $outfilename ); print $OUTF join ( "\t", 0, 0, 0, "CHR", "Position" ), "\n"; for my $i ( 0 .. $#binsa ) { my $ya = ( $binsa[$i] // 0 ); my $yb = ( $binsb[$i] // 0 ); push ( @{$reverseb{$yb}}, [ $i, $ya ] ); my ( $c, $x ) = convertback ( $i * $binsize ); print $OUTF join ( "\t", $i*$binsize, $ya, $yb, $c, $x-($binsize/2) ), "\n"; #for bars: print $OUTF join ( "\t", ($i+1)*$binsize-1, $ya, $yb, $c, $x+$binsize-1 ), "\n"; if ( $ya > $ymaxa ) { $ymaxa = $ya; } if ( $yb > $ymaxb ) { $ymaxb = $yb; } } print $OUTF join ( "\t", $#binsa*$binsize+1, 0, 0, "", "" ), "\n"; stdclose ( $OUTF ); print "$remark Y maxima were $ymaxa, $ymaxb\n"; # print top few highest bins my @sortedkeys = sort ( {$b<=>$a} keys %reverseb ); my $nprt = 0; my $nprtchr5 = 0; my $i = 0; print join ( "\t", "Rank", "Count", "fstSum", "CHR", "Position" ), "\n"; while ( $nprtchr5 < $printntopbins ) { foreach my $valueref ( @{$reverseb{$sortedkeys[$i]}} ) { my $value = $valueref->[0]; my $ya = $valueref->[1]; my ( $c, $x ) = convertback ( $value * $binsize ); $nprt++; my $flag = ''; if ( $c eq 'CHR5' ) { $nprtchr5++; $flag = $nprtchr5; if ( ( $x <= 24618603 ) and ( ($x+$binsize) >= 24618603 ) ) { $flag .= "=HIT"; } } print join ( "\t", $nprt, $ya, sprintf("%0.3f",$sortedkeys[$i]), $c, $x, $flag ), "\n"; } $i++; } system ( "cp -puv $0 notebook/" ); print "$0 Done ".timestr()."\n"; exit 0; ############################################################### sub convertback { my ( $x ) = @_; ############################################################### my $start = 0; my $chr = 'CHR1'; foreach my $achr ( sort keys %chrstart ) { if ( $x >= $chrstart{$achr} ) { $start = $chrstart{$achr}; $chr=$achr; } } $x -= $start; return ( $chr, $x ); } # sub convertback ############################################################### sub commify ############################################################### # http://perldoc.perl.org/perlfaq5.html#How-can-I-output-my-numbers-with-commas { local $_ = shift; 1 while s/^([-+]?\d+)(\d{3})/$1,$2/; return $_; } # commify ############################################################### sub timestr ############################################################### { @_ = localtime(shift || time); return(sprintf("%04d/%02d/%02d %02d:%02d:%02d", $_[5]+1900, $_[4]+1, $_[3], @_[2,1,0])); } # sub timestr ############################################################### sub stdopen { my ( $mode, $filename, $extratext ) = @_; ############################################################### # a replacement for the three-parameter open which also allows # the use of "-" as the file name to mean STDIN or STDOUT my $fh; # the file handle if ( $filename eq "-" ) # only exact match to "-" has special meaning { if ( $mode =~ m/>/ ) { $fh = *STDOUT } else { $fh = *STDIN } } else { # supplemental passed text for error messages, need one more space if ( defined $extratext ) { $extratext .= " " } else { $extratext = "" } my $text; # this is only used for error message if ( $mode =~ m/^\+?>>/ ) # ">>" or "+>>" { $text = "append" } elsif ( $mode =~ m/^\+?>/ ) # ">" or "+>" { $text = "output" } elsif ( $mode =~ m/^\+?$/ ) # output mode { $mode = "|-"; $filename = "gzip -c > \"$filename\""; } elsif ( $mode =~ m/^<$/ ) # input mode { $mode = "-|"; $filename = "gunzip -c \"$filename\""; } else { die "Error, can't handle gzip compression with mode \"$mode\" for file \"filename\"\n"; } } # if gzip compressed file elsif ( $filename =~ m/\.bz2$/ ) { if ( $mode =~ m/^>$/ ) # output mode { $mode = "|-"; $filename = "bzip2 -c > \"$filename\""; } elsif ( $mode =~ m/^<$/ ) # input mode { $mode = "-|"; $filename = "bunzip2 -c \"$filename\""; } elsif ( $mode =~ m/^>>$/ ) # append mode { $mode = "|-"; $filename = "gzip -c >> \"$filename\""; } else { die "Error, can't handle bzip2 compression with mode \"$mode\" for file \"filename\"\n"; } } open ( $fh, $mode, $filename ) or die ( "Error opening ${extratext}file \"$filename\" for $text: $!\n" ); } # return the opened file handle to the caller return $fh; } # sub stdopen ############################################################### sub stdclose { my ( $fh ) = @_; ############################################################### # same as built-in close, except in case of STDIN or STDOUT, # and in this case the file handle is not closed unless ( fileno($fh) <= 2 ) # if file number is this low, is stdin or stdout or stderr { close ( $fh ) or die ( "Error closing file handle: $!\n" ); } } # sub stdclose #eof