#!/usr/bin/perl use strict; use warnings; # Notice - this is a custom Perl script as described in the 2016 Nature Genetics paper # "A high-quality carrot genome assembly provides new insights into carotenoid # accumulation and Asterid genome evolution" # This program was referenced by this line: # ... "Scaffold connections supported by at least two PE BACs were annotated and sequences # were further connected using a custom Perl program." ... # Please be aware that this is not a general purpose program, it is highly specific to # our analysis, and is provided for informative purposes only. # configuration variables my ( $pfx ) = $0 =~ m/\/(\d+)[^\/]*$/; my $blasthits = "/illumina/carrot/bgi/20130304/scripts/46.DHbac.blast.tsv"; my $newbacseq = "50.DHbac.fna"; my $outtablename = "$pfx.BacEndConn.tsv"; my $remoteoutgffname = "/home/dsenalik/mnt/share/gbrowse/bgi20130304ann/$pfx.BacEndConn.gff3"; my $localoutgffname = "$pfx.BacEndConn.gff3"; my $minpctsim = 96; my $minpctlen = 90; my $maxhitnum = 1; my $maxtothits = 2; my $maxbacsize = 300000; # global variables my %blasthits; my %pairedhits; ##### Parse blast file and store filtered good hits in %blasthits { print "Parsing blast file \"$blasthits\"\n"; my $nlines = 0; open my $INF,"<",$blasthits or die "Error opening input file \"$blasthits\": $!\n"; while ( my $aline = <$INF> ) { $nlines++; $aline =~ s/[\r\n]//g; my @cols = split ( /\t/, $aline ); # blast output columns: # [0]qseqid [1]sseqid [2]pident [3]length [4]mismatch [5]gapopen [6]qstart [7]qend # [8]sstart [9]send [10]evalue [11]bitscore # optional [12]querylen [13]subjectlen # filtering next if ( $cols[2] < $minpctsim ); next if ( ( $cols[3] * 100 / $cols[12] ) < $minpctlen ); # store good hits my ( $key, $end ) = split ( /\./, shift ( @cols ) ); push ( @{$blasthits{$key}->{$end}}, \@cols ); #print "$nlines ".join ( "\t", $key, $end, @cols )."\n"; #@@@ #last if ( $nlines > 100000 ); #@@@ } # while <$INF> close $INF; print commify($nlines), " lines processed, stored ".commify(scalar keys %blasthits)." queries ".timestr()."\n"; } ##### Find pairs in different scaffolds, filtering for number of other hits { foreach my $querykey ( keys %blasthits ) { # must be a hit for each end if ( ( scalar keys %{$blasthits{$querykey}} ) > 1 ) { # remove if too many other hits my $nfends = scalar @{$blasthits{$querykey}->{f}}; my $nrends = scalar @{$blasthits{$querykey}->{r}}; next if ( $nfends > $maxtothits ); next if ( $nrends > $maxtothits ); for my $i ( 1..$nfends ) { next if ( $i > $maxhitnum ); for my $j ( 1..$nrends ) { next if ( $j > $maxhitnum ); # [0]sseqid [1]pident [2]length [3]mismatch [4]gapopen [5]qstart [6]qend # [7]sstart [8]send [9]evalue [10]bitscore [11]querylen [12]subjectlen # remove if both hits in same subject my $hitkey1 = $blasthits{$querykey}->{f}->[$i-1]->[0]; my $hitkey2 = $blasthits{$querykey}->{r}->[$j-1]->[0]; next if ( $hitkey1 eq $hitkey2 ); # store, and store as reciprocal push ( @{$pairedhits{$hitkey1}->{$hitkey2}}, [ $querykey, \@{$blasthits{$querykey}->{f}->[$i-1]}, \@{$blasthits{$querykey}->{r}->[$i-1]} ] ); push ( @{$pairedhits{$hitkey2}->{$hitkey1}}, [ $querykey, \@{$blasthits{$querykey}->{f}->[$i-1]}, \@{$blasthits{$querykey}->{r}->[$i-1]} ] ); } # for $j } # for $i } # if hit for each end } # foreach my $querykey ( keys %blasthits ) } ##### save table { my $nsaved = 0; my $nge2 = 0; open my $OUTG,">",$localoutgffname or die "Error opening output file \"$localoutgffname\": $!\n"; open my $OUTF,">",$outtablename or die "Error opening output file \"$outtablename\": $!\n"; # output file headers print $OUTF join ( "\t", "Seq1", "Seq2", "#Connections", "Total Distance", "F Subject", "F Identity", "F Start", "F End", "F Length", "R Subject", "R Identity", "R Start", "R End", "R Length" ), "\n"; print $OUTG "##gff-version 3\n"; print $OUTG "##Index-subfeatures 1\n"; print $OUTG "\n"; foreach my $key1 ( sort keys %pairedhits ) { my $key1index = 0; foreach my $key2 ( sort keys %{$pairedhits{$key1}} ) { my $nlinks = scalar ( @{$pairedhits{$key1}->{$key2}} ); print $OUTF join ( "\t", $key1, $key2, $nlinks ), "\n"; for my $i ( 1..$nlinks ) { # my $distf = min ( ( $pairedhits{$key1}->{$key2}->[$i-1]->[1]->[12] - $pairedhits{$key1}->{$key2}->[$i-1]->[1]->[8] ), # $pairedhits{$key1}->{$key2}->[$i-1]->[1]->[8] ); # my $distr = min ( ( $pairedhits{$key1}->{$key2}->[$i-1]->[2]->[12] - $pairedhits{$key1}->{$key2}->[$i-1]->[2]->[8] ), # $pairedhits{$key1}->{$key2}->[$i-1]->[2]->[8] ); my $distf; my $distr; my @dir; if ( $pairedhits{$key1}->{$key2}->[$i-1]->[1]->[7] < $pairedhits{$key1}->{$key2}->[$i-1]->[1]->[8] ) { $distf = $pairedhits{$key1}->{$key2}->[$i-1]->[1]->[12] - $pairedhits{$key1}->{$key2}->[$i-1]->[1]->[8]; $dir[1] = "+"; } else { $distf = $pairedhits{$key1}->{$key2}->[$i-1]->[1]->[8]; $dir[1] = "-"; } if ( $pairedhits{$key1}->{$key2}->[$i-1]->[2]->[7] < $pairedhits{$key1}->{$key2}->[$i-1]->[2]->[8] ) { $distr = $pairedhits{$key1}->{$key2}->[$i-1]->[2]->[12] - $pairedhits{$key1}->{$key2}->[$i-1]->[2]->[8]; $dir[2] = "+"; } else { $distr = $pairedhits{$key1}->{$key2}->[$i-1]->[2]->[8]; $dir[2] = "-"; } my $totdist = $distf + $distr; if ( $totdist > $maxbacsize ) { $totdist = '*' . $totdist; } print $OUTF join ( "\t", $key1, $key2, "", $pairedhits{$key1}->{$key2}->[$i-1]->[0], $totdist, $pairedhits{$key1}->{$key2}->[$i-1]->[1]->[0], $pairedhits{$key1}->{$key2}->[$i-1]->[1]->[1], $pairedhits{$key1}->{$key2}->[$i-1]->[1]->[7], $pairedhits{$key1}->{$key2}->[$i-1]->[1]->[8], $pairedhits{$key1}->{$key2}->[$i-1]->[1]->[12], $pairedhits{$key1}->{$key2}->[$i-1]->[2]->[0], $pairedhits{$key1}->{$key2}->[$i-1]->[2]->[1], $pairedhits{$key1}->{$key2}->[$i-1]->[2]->[7], $pairedhits{$key1}->{$key2}->[$i-1]->[2]->[8], $pairedhits{$key1}->{$key2}->[$i-1]->[2]->[12] ), "\n"; unless ( $totdist =~ m/\*/ ) { $key1index++; # the two hits may be in different order (flipped) than $key1 $key2 order my $which; my $otherwhich; if ( $pairedhits{$key1}->{$key2}->[$i-1]->[1]->[0] eq $key1 ) { $which = 1; $otherwhich = 2; } elsif ( $pairedhits{$key1}->{$key2}->[$i-1]->[2]->[0] eq $key1 ) { $which = 2; $otherwhich = 1; } else { die "Program bug, neither \"$pairedhits{$key1}->{$key2}->[$i-1]->[1]->[0]\" nor \"$pairedhits{$key1}->{$key2}->[$i-1]->[2]->[0]\" matches \"$key1\"\n"; } my $hitstart = min ( $pairedhits{$key1}->{$key2}->[$i-1]->[$which]->[7], $pairedhits{$key1}->{$key2}->[$i-1]->[$which]->[8] ); my $hitend = max ( $pairedhits{$key1}->{$key2}->[$i-1]->[$which]->[7], $pairedhits{$key1}->{$key2}->[$i-1]->[$which]->[8] ); my $parentstart = 1; my $parentend = $pairedhits{$key1}->{$key2}->[$i-1]->[$which]->[12]; my $markerend = 2; my $markerstart = $pairedhits{$key1}->{$key2}->[$i-1]->[$which]->[12] - 1; if ( $dir[$which] eq "+" ) { $parentstart = $hitstart; $markerend = $parentend; } else { $parentend = $hitend; $markerstart = $parentstart; } print $OUTG join ( "\t", $key1, "blastn", "BacEndConn", $parentstart, $parentend, $pairedhits{$key1}->{$key2}->[$i-1]->[$which]->[1], $dir[$which], ".", "ID=".$key1.".conn.".$key1index . ";Name=BL\%3A".$key2 . ";Pair=$which" . ";Note=Distance\%3D".$totdist . " BAC\%3D$pairedhits{$key1}->{$key2}->[$i-1]->[0]" . " Destination\%3D$key2 at $pairedhits{$key1}->{$key2}->[$i-1]->[$otherwhich]->[7]..$pairedhits{$key1}->{$key2}->[$i-1]->[$otherwhich]->[8]" ), "\n"; print $OUTG join ( "\t", $key1, "blastn", "BacEndConn_part", $hitstart, $hitend, $pairedhits{$key1}->{$key2}->[$i-1]->[$which]->[1], $dir[$which], ".", "ID=".$key1.".conn.".$key1index.".1" . ";Name=BL\%3A".$key2 . ";Parent=".$key1.".conn.".$key1index . ";Pair=$which" ), "\n"; print $OUTG join ( "\t", $key1, "blastn", "BacEndConn_part", $markerstart, $markerend, $pairedhits{$key1}->{$key2}->[$i-1]->[$which]->[1], $dir[$which], ".", "ID=".$key1.".conn.".$key1index . ".2" . ";Name=BL\%3A".$key2 . ";Parent=".$key1.".conn.".$key1index . ";Pair=3" ), "\n"; # Reciprocal connection will be done when we see the reciprocal hit } } $nsaved++; if ( $nlinks >= 2 ) { $nge2++; } } } close $OUTF; close $OUTG; print ( "Saved ".commify($nsaved)." unique connections, ".commify($nge2)." had 2 or more BACs\n" ); } system ( "cp -puv $0 notebook/" ); system ( "cp -puv $localoutgffname $remoteoutgffname/" ); system ( "cp -puv $outtablename notebook/" ); print "Done ".timestr()."\n"; exit 0; ############################################################### sub commify ############################################################### # http://perldoc.perl.org/perlfaq5.html#How-can-I-output-my-numbers-with-commas { local $_ = shift; 1 while s/^([-+]?\d+)(\d{3})/$1,$2/; return $_; } # commify ############################################################### sub timestr ############################################################### { @_ = localtime(shift || time); return(sprintf("%04d/%02d/%02d %02d:%02d:%02d", $_[5]+1900, $_[4]+1, $_[3], @_[2,1,0])); } # sub timestr ############################################################ sub min { ############################################################ # http://www.perlmonks.org/?node_id=648321 my $min = shift; foreach (@_) { if ( $_ ) # special modification to ignore zero values { if ( $_ < $min ) { $min = $_; } } } return $min; } # sub min ############################################################ sub max { ############################################################ my $max = shift; foreach (@_) { if ( $_ > $max ) { $max = $_; } } return $max; } # sub max #eof