Thesis Functions
Perl functions used in thesis scripts.
1 # USAGE IN SCRIPTS: 2 # require "functions.pl"; 3 # use vars qw( @aas @sorted_aas @aas_and_gaps ); 4 # 5 # A utility functions file 6 7 use warnings; 8 use strict; 9 use Math::Complex; 10 11 # all the aas included in an EMBOSS matrix, and ambiguity codes 12 my @emboss_aas = ( 'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', "B", "Z", "X", "*" ); 13 14 # excludes B, Z, X, and * 15 my @aas = ( 'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V' ); 16 17 my @sorted_aas = sort( @aas ); 18 19 # includes a gap character 20 my @aa_and_gaps = ( 'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', '-'); 21 22 my $embossdata = "/Users/ajohnson/familybuild/scripts/.embossdata/"; 23 24 ################################################################################ 25 # UTILITY 26 ################################################################################ 27 28 # trim the whitespace off of the front and end of a string 29 sub trim 30 { 31 chomp( @_ ); 32 map { /^\s*(\S.*\S)\s*$/ } @_; 33 } 34 35 # given a tab delimited line, returns an array 36 sub make_fields 37 { 38 my $line = shift( @_ ); 39 chomp( $line ); 40 my @line_fields = split( /\t/, $line ); 41 return @line_fields; 42 } 43 44 # returns a hash reference col name -> value 45 sub get_fields 46 { 47 my $line = shift( @_ ); 48 chomp( $line ); 49 my @line_fields = @_; 50 51 my @fields = split( /\t/, $line ); 52 my %fields; 53 54 foreach my $field_name (@line_fields) 55 { 56 $fields{$field_name} = shift( @fields ); 57 #print "$field_name = ".$fields{$field_name} . "\n"; 58 } 59 60 return \%fields; 61 } 62 63 # makes a line for a tab delineated file 64 # from a hash in the given order of fields 65 sub make_line 66 { 67 my $hash = shift( @_ ); 68 my @fields = @_; 69 my @line; 70 71 foreach my $field ( @fields ) 72 { 73 push( @line, $hash->{$field} ); 74 } 75 76 return join( "\t", @line ); 77 } 78 79 # Prints out each key-value pair of a hash reference 80 sub print_hash 81 { 82 my $hash_ref = shift( @_ ); 83 84 while ( my ($key, $value) = each(%$hash_ref) ) { 85 print "\t'$key' => $value\n"; 86 } 87 } 88 89 # parses a settings file and returns a hash of setting value pairs 90 sub get_settings 91 { 92 my %settings; 93 94 foreach my $file ( @_ ) 95 { 96 open( SETTINGS, $file ) || die( "Could not open settings file $file!\n"); 97 98 while( my $line = <SETTINGS> ) 99 { 100 &trim( $line ); 101 102 #don't include comments, sections, or empty lines 103 next if( substr( $line, 0, 1) eq "#" || substr( $line, 0, 1) eq "=" || !$line ); 104 105 my @fields = split( "\t", $line ); 106 my $pref_name = shift( @fields ); 107 108 if( scalar( @fields ) > 1 ) 109 { 110 $settings{$pref_name} = \@fields; 111 } 112 elsif( scalar( @fields ) == 1 ) 113 { 114 $settings{$pref_name} = shift( @fields ); 115 } 116 } 117 } 118 119 return \%settings; 120 } 121 122 # Checks that a given settings file contains all the settings 123 # in the given hash from that settings file 124 sub check_settings 125 { 126 my $settings_file = shift( @_ ); 127 my $settings = shift( @_ ); 128 129 foreach my $setting( @_ ) 130 { 131 if( !defined( $settings->{$setting} ) || !$settings->{$setting} ) 132 { 133 die "$setting is not in $settings_file!\n"; 134 } 135 } 136 } 137 138 sub print_settings 139 { 140 my $title = shift( @_ ); 141 my $settings = shift( @_ ); 142 143 print "$title:\n\n"; 144 145 foreach my $setting ( @_ ) 146 { 147 my $value = $settings->{$setting}; 148 149 if( ref($value) eq "ARRAY" ) 150 { 151 print "\t$setting\t=\t" . join( "\t", @$value ) . "\n"; 152 } 153 else 154 { 155 print "\t$setting\t=\t$value\n"; 156 } 157 } 158 159 print "\n"; 160 } 161 162 # reads in an entire file at once 163 sub slurp 164 { 165 my $file = shift( @_ ); 166 167 my $old_terminator = $/; 168 undef $/; 169 open( SLURP, $file ) || warn "Could not open file $file for slurp!\n"; 170 my $slurp = <SLURP>; 171 close( SLURP ); 172 $/ = $old_terminator; 173 174 return $slurp; 175 } 176 177 # will submit jobs for all families where the last 178 # arg to the script being run here is the family folder 179 # 180 # Runs in SGE parallel if load is 0 181 # Runs linearly if load is -1 182 # Runs multiple jobs at once and keeps the load under the 183 # given number otherwise 184 sub submit_family_jobs 185 { 186 my $maindir = shift( @_ ); 187 my $script = shift( @_ ); 188 my $sgename = shift( @_ ); 189 my $sgedir = shift( @_ ); 190 my $load = shift( @_ ); 191 my $wait = 4; 192 my @args = @_; 193 194 print "Running jobs for $maindir\n"; 195 print "\tScript: $script\n"; 196 print "\tSGEName: $sgename\n"; 197 print "\tSGEDir: $sgedir\n"; 198 print "\tLoad: $load\n"; 199 print "\tArgs: " . join( ", ", @args ) . "\n"; 200 201 if( ! -e $sgedir ) 202 { 203 print "SGEDIR: '$sgedir' does not exist; creating\n\n"; 204 mkpath( $sgedir ); 205 } 206 207 my @commands; 208 my $parallel = 0; 209 210 if( $load == 0 ) 211 { 212 print "Load is 0, running in parallel...\n\n"; 213 $parallel = 1; 214 } 215 216 if( $parallel || $load < 0 ) 217 { 218 $wait = 1; 219 } 220 221 my @folders = <$maindir/*.fasta.folder>; 222 223 my $submit_index = 0; 224 my $job_total = scalar(@folders); 225 226 # run jobs for all folders, in alphabetical order 227 foreach my $folder (sort(@folders)) 228 { 229 $folder =~ /([^\/]+)\.fasta\.folder$/; 230 my $name = $1; 231 232 $submit_index++; 233 234 my $submit_command = "perl $script " . 235 join( " ", @args ) . " $folder"; 236 237 # submit job through qsub 238 if( $parallel ) 239 { 240 print "Performing job $submit_index/$job_total\n"; 241 $submit_command = "qsub -cwd -b y -o $sgedir -e $sgedir -N $sgename\_$name $submit_command"; 242 print "\n$submit_command\n\n"; 243 system( $submit_command ); 244 &check_num_jobs($wait); 245 } 246 # add to the linear queue 247 else 248 { 249 push( @commands, $submit_command ); 250 } 251 } 252 253 # run multiple jobs at once on a single computer 254 if( ! $parallel && $load > 0 ) 255 { 256 &job_processor( $load, $wait, @commands ); 257 } 258 # run one job at a time 259 else 260 { 261 my $run_index = 0; 262 foreach my $command ( @commands ) 263 { 264 print "Running $run_index/$job_total: $command\n"; 265 system( $command ); 266 $run_index++; 267 sleep( 1 ); 268 } 269 } 270 } 271 272 # returns when the number of jobs running in qstat 273 # is less than 80 274 sub check_num_jobs 275 { 276 my( $wait ) = @_; 277 278 sleep($wait); 279 280 # give 20 as leeway 281 my $max_num_jobs = 80; 282 my $count = &count_qstat_jobs(); 283 284 print "Jobs still running: $count\n"; 285 286 while( $count >= $max_num_jobs ) 287 { 288 sleep($wait+4); 289 $count = &count_qstat_jobs(); 290 } 291 } 292 293 # Counts the current qstat jobs running 294 # for the user 295 sub count_qstat_jobs 296 { 297 my $user = $ENV{"USER"}; 298 my $qstat = `qstat -u $user`; 299 my @lines = split("\n",$qstat); 300 @lines = grep { /$user/ } @lines; 301 my $count = scalar(@lines); 302 return $count; 303 } 304 305 # runs jobs on a single machine 306 sub job_processor 307 { 308 my( $max_load, $wait, @commands ) = @_; 309 # helps identify which jobs are coming from the same processor 310 my $random_id = int(rand(10000)); 311 my $n_jobs = 0; 312 # number of seconds to wait when overload 313 my $load_wait = 45; 314 315 for my $command ( @commands ) 316 { 317 my $load = &get_load(1); 318 319 # check the current load and wait if it's too much 320 while( $load > $max_load ) 321 { 322 print "JP $random_id (job $n_jobs): Load $load above $max_load; waiting $load_wait seconds\n"; 323 sleep $load_wait; 324 $load = &get_load(1); 325 } 326 327 print "JP $random_id (job $n_jobs): Starting: $command (load: $load)\n"; 328 print `date`; 329 330 # don't want zombie child processes! 331 $SIG{CHLD} = 'IGNORE'; 332 my $pid = fork(); 333 $n_jobs++; 334 335 if( $pid == 0 ) 336 { 337 exec( $command ); 338 } 339 340 print "JP $random_id (job $n_jobs): Waiting $wait seconds before submitting next command\n"; 341 342 sleep $wait; 343 } 344 } 345 346 # parses the uptime command to get a good 347 # idea of the current load 348 sub get_load 349 { 350 my $slot = shift( @_ ); 351 352 my $uptime = `uptime`; 353 354 #print "Getting slot $slot for uptime: $uptime\n"; 355 356 if( $uptime !~ /([0-9.]+), ([0-9.]+), ([0-9.]+)$/ ) 357 { 358 print "Could not parse uptime string: $uptime\n"; 359 return undef; 360 } 361 362 #print "$1, $2, $3\n"; 363 364 if( $slot == 1 ) 365 { 366 return $1; 367 } 368 elsif( $slot == 5 ) 369 { 370 return $2; 371 } 372 elsif( $slot == 15 ) 373 { 374 return $3; 375 } 376 else 377 { 378 return $1; 379 } 380 } 381 382 # http://www.perl.com/doc/FAQs/FAQ/oldfaq-html/Q4.13.html 383 sub round { 384 return int($_[0] + .5 * ($_[0] <=> 0)); 385 } 386 387 ################################################################################ 388 # SEQUENCE FIDDLING 389 ################################################################################ 390 391 # unwrap all a fasta record's newlines and make the format 392 # HEADER\tSEQUENCE 393 sub fasta_unwrap { 394 my $text = shift( @_ ); 395 396 $text =~ s/\n/\t/; 397 $text =~ s/\>//; 398 $text =~ s/\n//g; 399 400 return 0 if !$text; 401 402 my @fields = split( "\t", $text ); 403 my %return = ( "id" => $fields[0], "seq" => $fields[1] ); 404 return \%return; 405 } 406 407 # will take a string that has multiple fasta sequences in it 408 # and return an ID->sequence hash of the contents 409 sub fasta_split { 410 411 my %hash; 412 413 foreach my $sequences ( @_ ) 414 { 415 my @sequences = split(/\>/, $sequences); 416 &trim( @sequences ); 417 418 foreach my $sequence ( @sequences ) 419 { 420 next if( !$sequence ); 421 my @lines = split( /\n/, $sequence ); 422 &trim( @lines ); 423 my $id = shift( @lines ); 424 $hash{$id} = join( "", @lines ); 425 } 426 } 427 428 return \%hash; 429 } 430 431 # figures out the first position in an alignment 432 # where all sequences have started 433 sub align_start { 434 435 my ( $alignments, $length ) = shift( @_ ); 436 437 if( !$length ) 438 { 439 my @ids = keys(%$alignments); 440 $length = length( $alignments->{ $ids[0] } ); 441 } 442 443 my $start = 0; 444 445 # find the start of the complete alignment 446 for my $location ( 0 .. $length ) 447 { 448 # keeps track of whether all the sequences 449 # have a base at this position 450 my $all = 1; 451 452 while ( my ($id, $align) = each(%$alignments) ) 453 { 454 $all = 0 if ( substr( $align, $location, 1 ) eq "-" ); 455 } 456 457 if( $all ) 458 { 459 $start = $location; 460 last; 461 } 462 } 463 464 return $start+1; 465 } 466 467 # figures out the place in the alignment where the 468 # shortest sequence ends 469 sub align_end { 470 471 my ($alignments, $length) = shift( @_ ); 472 473 if( !$length ) 474 { 475 my @ids = keys(%$alignments); 476 $length = length( $alignments->{ $ids[0] } ); 477 } 478 479 my $offset = 0; 480 481 # find the end of the complete alignment 482 for my $location ( 1 .. $length ) 483 { 484 # keeps track of whether all the sequences 485 # have a base at this position 486 my $all = 1; 487 488 while( my ($id, $align) = each(%$alignments) ) 489 { 490 $all = 0 if( substr( $align, -$location, 1 ) eq "-" ); 491 #print( $location . ":\t" . substr( $align, -$location, 1 ) . "\n" ); 492 } 493 494 #print( "=====\n" ); 495 496 if( $all ) 497 { 498 $offset = $location; 499 last; 500 } 501 } 502 503 my $end = $length - $offset + 1; 504 505 #print( "END: $end\n" ); 506 507 return $end; 508 } 509 510 # will take in a ID->alignment hashreference and return 511 # a hash where the alignments have been trimmed so that 512 # the trailing beginnings and ends are gone 513 sub align_trim { 514 515 my $alignments = shift( @_ ); 516 my ( $length, $start, $end ) = shift( @_ ); 517 518 if( !$length ) 519 { 520 my @ids = keys(%$alignments); 521 $length = length( $alignments->{ $ids[0] } ); 522 } 523 524 if( !$start ) 525 { 526 # the number of positions from the beginning 527 $start = &align_start( $alignments ); 528 } 529 530 if( !$end ) 531 { 532 # the number of positions from the end 533 $end = &align_end( $alignments ); 534 } 535 536 my %trimmed; 537 538 #print "Length: $length ($ids[0], $has)\n"; 539 540 # trim all sequences 541 while( my ($id, $align) = each( %$alignments)) 542 { 543 $trimmed{$id} = substr( $align, $start-1, $end - $start + 1 ); 544 } 545 546 return \%trimmed; 547 548 549 #things stay the same 550 if( $start == 0 && $end == $length ) 551 { 552 # trim all sequences 553 while( my ($id, $align) = each( %$alignments)) 554 { 555 $trimmed{$id} = $alignments->{$id}; 556 } 557 } 558 #no end for substring 559 elsif( $end == $length ) 560 { 561 # trim all sequences 562 while( my ($id, $align) = each( %$alignments)) 563 { 564 $trimmed{$id} = substr( $align, $start ); 565 } 566 } 567 else 568 { 569 # trim all sequences 570 while( my ($id, $align) = each( %$alignments)) 571 { 572 $trimmed{$id} = substr( $align, $start-1, -($end-1) ); 573 } 574 } 575 576 return \%trimmed; 577 } 578 579 # will take in a hash of alignments and a substitution matrix 580 # and return a hash containing the evaluation of the alignment 581 # right now only does two alignments at once! 582 # NOTE: THESE SHOULD BE TRIMMED ALIGNMENTS 583 sub evaluate_alignment 584 { 585 # hash of alignments by ID->alignment 586 my $alignments = shift( @_ ); 587 # substitution matrix hash for scoring 588 # if not present, no score will be present 589 my $matrix = shift( @_ ); 590 my $gap_penalty = shift( @_ ); 591 my $extend_penalty = shift( @_ ); 592 my @ids = @_; 593 594 my $id1 = $ids[0]; 595 my $id2 = $ids[1]; 596 597 #print "Evaluating $id1 vs $id2\n"; 598 599 my $align1 = $alignments->{$ids[0]}; 600 my $align2 = $alignments->{$ids[1]}; 601 602 #print "seq1: $seq1\nseq2: $seq2\n"; 603 604 my $start = 1; 605 my $end = length($align1); 606 607 #print "Start: $start\nEnd: $end\n"; 608 609 my $in_gap = 0; 610 my $gap_count = 0; 611 my $num_gaps = 0; 612 my $num_gap_pos = 0; 613 my $diff_positions = 0; 614 my $total_positions = 0; 615 my $similarity_positions = 0; 616 my $match_positions = 0; 617 my $score = 0; 618 619 my %transitions; 620 621 #print "Adding $id1 versus $id2 to family matrix\n"; 622 623 for my $location ( $start .. $end ) 624 { 625 $location--; 626 627 my $aa1 = substr( $align1, $location, 1 ); 628 my $aa2 = substr( $align2, $location, 1 ); 629 630 #print "$aa1 vs $aa2: "; 631 632 if( $aa1 eq "-" || $aa2 eq "-" ) 633 { 634 $in_gap = 1; 635 $gap_count++; 636 #print "\n"; 637 } 638 else 639 { 640 if( $in_gap ) 641 { 642 $score -= $gap_penalty + $extend_penalty * $gap_count - $extend_penalty; 643 $num_gap_pos += $gap_count; 644 $num_gaps++; 645 $gap_count = 0; 646 $in_gap = 0; 647 #print "GAPS = $num_gaps\n"; 648 #print "GAP COUNT = $num_gap_pos\n" 649 } 650 651 if( $aa1 le $aa2 ) 652 { 653 $transitions{$aa1}{$aa2}++; 654 } 655 else 656 { 657 $transitions{$aa2}{$aa1}++; 658 } 659 660 if( !defined($matrix->{$aa1}{$aa2}) ) 661 { 662 die "ERROR TRYING TO ACCESS SUBSTITUTION MATRIX AT $aa1:$aa2\n"; 663 } 664 else 665 { 666 #print "SCORE $score + $matrix->{$aa1}{$aa2} ($aa1:$aa2)\n"; 667 668 #print "$matrix->{$aa1}{$aa2}\n"; 669 670 $score += $matrix->{$aa1}{$aa2}; 671 672 if( $matrix->{$aa1}{$aa2} > 0 ) 673 { 674 #print "\tSIMILARITY POSITION: $aa2 $aa2 = $matrix->{$aa1}{$aa2}\n"; 675 $similarity_positions++; 676 } 677 } 678 679 if( $aa1 ne $aa2 ) 680 { 681 $diff_positions++; 682 } 683 else 684 { 685 $match_positions++; 686 } 687 688 $total_positions++; 689 } 690 } 691 692 my $total_length = $total_positions + $num_gap_pos; 693 #my $percent_gaps = $num_gap_pos / $total_length; 694 my $percent_identity = eval{ $match_positions / $total_length }; 695 696 $percent_identity = 0 if( !$percent_identity ); 697 #my $percent_similarity = $similarity_positions / $total_length; 698 699 my %fields = ( 700 "id1" => $id1, 701 "id2" => $id2, 702 "length" => $total_length, 703 "score" => $score, 704 "pi" => $percent_identity, 705 "ident_positions" => $match_positions, 706 "similar_positions" => $similarity_positions, 707 "num_gaps" => $num_gaps, 708 "num_gap_positions" => $num_gap_pos, 709 "align1" => $align1, 710 "align2" => $align2 711 ); 712 713 return \%fields; 714 } 715 716 # Given a sequence and the its start and ending sections 717 # reconstruct the squence as it appeared in its alignment 718 sub align_construct { 719 720 my ( $sequence, $sections ) = @_; 721 my @sections = split( ";", $sections ); 722 my $alignment = ""; 723 724 my $last_end = 0; 725 726 foreach my $segment ( @sections ) 727 { 728 # start,end:length 729 $segment =~ /(\d+),(\d+):(\d+)/; 730 my $seq_start = $1; 731 my $align_start = $2; 732 my $length = $3; 733 734 if( $last_end ) 735 { 736 $alignment .= "-" x ($align_start - $last_end); 737 } 738 739 #print( "SEQ: $sequence\n\tSeq start: $seq_start, Align start: $align_start, Length: $length\n" ); 740 741 $alignment .= substr( $sequence, $seq_start-1, $length ); 742 $last_end = $align_start + $length; 743 } 744 745 return $alignment; 746 } 747 748 # compresses an alignment sequence based on the given start and end 749 sub align_compress { 750 751 my ( $alignment, $start, $end ) = @_; 752 753 my @sections; 754 755 my $position = 0; 756 757 my $segment = 0; 758 my $segment_start = 0; 759 my $align_start = 0; 760 761 for my $location ( 1 .. length( $alignment ) ) 762 { 763 my $aa = substr( $alignment, $location-1, 1 ); 764 $position++ if( $aa ne "-" ); 765 766 next if( $location < $start || $location > $end ); 767 768 if( $aa eq "-" ) 769 { 770 if( $segment ) 771 { 772 push( @sections, "$segment_start,$align_start:$segment" ); 773 $segment = 0; 774 $segment_start = 0; 775 $align_start = 0; 776 } 777 778 #print( "$aa\tGAP\tS_POS: $position\tA_POS: $location\tSEGMENT: $segment\tS_START: $segment_start\tA_START: $align_start\n" ); 779 } 780 else 781 { 782 $segment++; 783 784 if( $segment_start == 0 ) 785 { 786 $align_start = $location; 787 $segment_start = $position; 788 } 789 790 #print( "$aa\tAA\tS_POS: $position\tA_POS: $location\tSEGMENT: $segment\tS_START: $segment_start\tA_START: $align_start\n" ); 791 } 792 } 793 794 if( $segment ) 795 { 796 push( @sections, "$segment_start,$align_start:$segment" ); 797 } 798 799 return join(";", @sections ); 800 } 801 802 # loads FASTA sequences from a file 803 sub load_fasta_seqs { 804 my( $file ) = @_; 805 806 my %seqs; 807 my $seqio_object = Bio::SeqIO->new(-file => "$file"); 808 809 while( my $seq_obj = $seqio_object->next_seq ) { 810 if( !defined($seq_obj)) { 811 return \%seqs; 812 } 813 else { 814 $seqs{$seq_obj->id()} = $seq_obj->seq(); 815 } 816 } 817 818 return \%seqs; 819 } 820 821 ################################################################################ 822 # MATRIX I/O 823 ################################################################################ 824 825 # takes in a filename 826 # returns a hash reference to a matrix 827 sub read_linear_matrix { 828 my( $filename ) = @_; 829 my $matrix = &init_matrix(); 830 831 if( ! open( MATRIX, "$filename" ) ) 832 { 833 print STDERR "read_linear_matrix could not open \"$filename\" for reading!\n"; 834 return 0; 835 } 836 837 while( my $line = <MATRIX> ) 838 { 839 chomp( $line ); 840 my @row = split( /\t/, $line ); 841 my ($aa1, $aa2, $value ) = @row; 842 $matrix->{$aa1}{$aa2} = $value; 843 } 844 845 return $matrix; 846 } 847 848 # read in a matrix in linear format, use the value in the given column 849 # FROM the amino acids, starting at 0 850 sub read_linear_matrix_col { 851 my( $filename, $col ) = @_; 852 my $matrix = &init_matrix(); 853 854 if( ! open( MATRIX, "$filename" )) 855 { 856 print STDERR "read_linear_matrix could not open \"$filename\" for reading!\n"; 857 return 0; 858 } 859 860 while( my $line = <MATRIX> ) 861 { 862 chomp( $line ); 863 my @row = split( /\t/, $line ); 864 my $aa1 = shift( @row ); 865 my $aa2 = shift( @row ); 866 $matrix->{$aa1}{$aa2} = $row[$col]; 867 } 868 869 return $matrix; 870 } 871 872 #takes in a filename 873 #returns a hash reference of a matrix 874 sub read_square_matrix { 875 my( $filename ) = @_; 876 my $matrix = &init_matrix( ); 877 878 if( ! open( MATRIX, "$filename" ) ) 879 { 880 print STDERR "read_square_matrix could not open \"$filename\" for reading!\n"; 881 return 0; 882 } 883 884 my $line = <MATRIX>; 885 chomp( $line ); 886 my @columns = split( /\t/, $line ); 887 #get rid of AAS field 888 shift( @columns ); 889 890 while( $line = <MATRIX> ) 891 { 892 chomp( $line ); 893 my @row = split( /\t/, $line ); 894 my $row_aa = shift( @row ); 895 foreach my $col ( 0 .. $#row ) 896 { 897 $matrix->{$row_aa}{$columns[$col]} = $row[$col]; 898 print "$row_aa\t$columns[$col]\t$matrix->{$row_aa}{$columns[$col]}\n"; 899 } 900 } 901 902 print "RETURNING SQUARE MATRIX:\n"; 903 &print_square_matrix( $matrix, @aas ); 904 905 return $matrix; 906 } 907 908 #takes in a hash reference and a filename 909 sub write_linear_matrix { 910 my( $matrix, $filename ) = @_; 911 912 if( ! open( MATRIX, "> $filename" ) ) 913 { 914 print STDERR "write_linear_matrix could not open \"$filename\" for writing!\n"; 915 return 0; 916 } 917 918 foreach my $aa1 ( @sorted_aas ) 919 { 920 foreach my $aa2 ( @sorted_aas ) 921 { 922 if( $aa2 ge $aa1 || $matrix->{$aa1}{$aa2} ) 923 { 924 print MATRIX "$aa1\t$aa2\t".$matrix->{$aa1}{$aa2}."\n"; 925 } 926 } 927 } 928 929 close( MATRIX ); 930 return 1; 931 } 932 933 #takes in a hash reference and a filename 934 sub write_square_matrix { 935 my $matrix = shift( @_ ); 936 my $filename = shift( @_ ); 937 my @order = @_; 938 939 if( !@order ) 940 { 941 print STDERR "WARNING: NO ORDER GIVEN TO WRITE_SQUARE_MATRIX!\n"; 942 } 943 944 if( ! open( MATRIX, "> $filename" ) ) 945 { 946 print STDERR "write_square_matrix could not open \"$filename\" for writing!\n"; 947 return 0; 948 } 949 950 &out_square_matrix( $matrix, *MATRIX, @order ); 951 952 close( MATRIX ); 953 } 954 955 # takes in a hash reference and a filename 956 # writes an EMBOSS formatted matrix to file 957 sub write_emboss_matrix { 958 my $matrix = shift( @_ ); 959 my $filename = shift( @_ ); 960 my @order = @emboss_aas; 961 962 if( ! open( MATRIX, "> $filename" ) ) 963 { 964 print STDERR "write_emboss_matrix could not open \"$filename\" for writing!\n"; 965 return 0; 966 } 967 968 &out_emboss_matrix( $matrix, *MATRIX, @order ); 969 970 close( MATRIX ); 971 } 972 973 # prints an emboss matrix to screen 974 sub print_emboss_matrix { 975 my $matrix = shift( @_ ); 976 my @order = @emboss_aas; 977 978 &out_emboss_matrix( $matrix, *STDOUT, @order ); 979 } 980 981 # used by print_ and write_emboss_matrix to output an EMBOSS matrix 982 sub out_emboss_matrix 983 { 984 my $matrix = shift( @_ ); 985 local *FH = shift( @_ ); 986 my @order = @_; 987 988 # add all of the elements in an EMBOSS matrix we usually ignore 989 # and use the matrix's minimum for their likelihood 990 #push( @order, "*" ); 991 #my $min = &matrix_min( $matrix ); 992 #my $max = &matrix_max( $matrix ); 993 994 my $line = join( " ", "", @order ); 995 print FH " " . $line . "\n"; 996 997 foreach my $aa1 ( @order ) 998 { 999 my @line; 1000 1001 #row heading 1002 push( @line, $aa1 ); 1003 1004 foreach my $aa2 ( @order ) 1005 { 1006 # if( $aa1 eq "*" || $aa2 eq "*" || 1007 # $aa1 eq "B" || $aa2 eq "B" || 1008 # $aa1 eq "Z" || $aa2 eq "Z" || 1009 # $aa1 eq "X" || $aa2 eq "X" ) 1010 # { 1011 # if( exists($matrix->{$aa1}{$aa2})) 1012 # { 1013 # #push the value onto the line 1014 # push( @line, $matrix->{$aa1}{$aa2} ); 1015 # } 1016 # elsif( $aa1 eq $aa2 && $aa1 eq "*" ) 1017 # { 1018 # #EBLOSUM62 does this, so we will copy it 1019 # push( @line, 1 ); 1020 # } 1021 # else 1022 # { 1023 # push( @line, $min ); 1024 # } 1025 # } 1026 # else 1027 # { 1028 #push the value onto the line 1029 if( exists($matrix->{$aa1}{$aa2}) ) 1030 { 1031 push( @line, $matrix->{$aa1}{$aa2} ); 1032 } 1033 else 1034 { 1035 push( @line, "x" ); 1036 } 1037 #} 1038 #print "$aa1:$aa2 = $matrix->{$aa1}{$aa2}\n"; 1039 } 1040 1041 #space delineated 1042 my $line = join( " ", @line ); 1043 print FH $line . "\n"; 1044 } 1045 } 1046 1047 # loads an emboss matrix from the given filename 1048 # returns the hash reference for the matrix 1049 sub read_emboss_matrix 1050 { 1051 my( $filename ) = @_; 1052 my $matrix = &init_matrix( ); 1053 1054 if( ! open( MATRIX, "$filename" ) ) 1055 { 1056 print STDERR "read_emboss_matrix could not open \"$filename\" for reading!\n"; 1057 return 0; 1058 } 1059 1060 my $read_cols = 0; 1061 my @columns; 1062 1063 while( my $line = <MATRIX> ) 1064 { 1065 chomp( $line ); 1066 1067 #discard comments 1068 next if( substr( $line, 0, 1 ) eq "#" ); 1069 1070 #print $line . "\n"; 1071 1072 #get column headers 1073 if( $read_cols == 0 ) 1074 { 1075 chomp( $line ); 1076 @columns = split( /\s+/, $line ); 1077 #get rid of blank field 1078 shift( @columns ); 1079 #print "COLUMNS: " . @columns . "\n"; 1080 $read_cols = 1; 1081 next; 1082 } 1083 1084 my @row = split( /\s+/, $line ); 1085 my $row_aa = shift( @row ); 1086 1087 foreach my $col ( 0 .. $#row ) 1088 { 1089 $matrix->{$row_aa}{$columns[$col]} = $row[$col]; 1090 #print "R:$row_aa\tC:$columns[$col]\t$matrix->{$row_aa}{$columns[$col]}\n"; 1091 } 1092 } 1093 1094 #print "RETURNING SQUARE EMBOSS MATRIX:\n"; 1095 #&print_square_matrix( $matrix, @aas ); 1096 1097 return $matrix; 1098 } 1099 1100 sub print_square_matrix { 1101 my $matrix = shift( @_ ); 1102 my @order = @_; 1103 1104 if( !@order ) 1105 { 1106 print STDERR "WARNING: NO ORDER GIVEN TO PRINT_SQUARE_MATRIX!\n"; 1107 } 1108 1109 &out_square_matrix( $matrix, *STDOUT, @order ); 1110 } 1111 1112 sub out_square_matrix { 1113 my $matrix = shift( @_ ); 1114 local *FH = shift( @_ ); 1115 my @order = @_; 1116 1117 my $line = join( "\t", "AAS", @order ); 1118 print FH $line . "\n"; 1119 1120 foreach my $aa1 ( @order ) 1121 { 1122 my @line; 1123 1124 #row heading 1125 push( @line, $aa1 ); 1126 1127 foreach my $aa2 ( @order ) 1128 { 1129 #push the value onto the line 1130 push( @line, $matrix->{$aa1}{$aa2} ); 1131 #print "$aa1:$aa2 = $matrix->{$aa1}{$aa2}\n"; 1132 } 1133 1134 #tab delineated 1135 $line = join( "\t", @line ); 1136 print FH $line . "\n"; 1137 } 1138 } 1139 1140 sub write_sorted_square_matrix { 1141 1142 my( $matrix, $filename ) = @_; 1143 1144 if( ! open( MATRIX, "> $filename" ) ) 1145 { 1146 print STDERR "write_square_matrix could not open \"$filename\" for writing!\n"; 1147 return 0; 1148 } 1149 1150 @aas = sort( @aas ); 1151 1152 #not using sorted--want the standard order of amino acids 1153 foreach my $aa1 ( @aas ) 1154 { 1155 my @line; 1156 1157 foreach my $aa2 ( @aas ) 1158 { 1159 push( @line, $matrix->{$aa1}{$aa2} ); 1160 } 1161 1162 unshift( @line, $aa1 ); 1163 1164 my $line = join( "\t", @line ); 1165 print MATRIX $line . "\n"; 1166 } 1167 1168 close( MATRIX ); 1169 } 1170 1171 # reads a list of amino acids and their values from a tab delineated file 1172 sub read_aas { 1173 1174 my $file = shift( @_ ); 1175 my $aas = &init_aas(); 1176 1177 open( AACOUNTS, $file ) || warn "COULD NOT OPEN FILE $file!"; 1178 1179 while( my $line = <AACOUNTS> ) 1180 { 1181 my( $aa, $value ) = split( /\t/, $line ); 1182 1183 if( $value ) 1184 { 1185 $aas->{$aa} += $value; 1186 } 1187 } 1188 1189 close( AACOUNTS ); 1190 1191 return $aas; 1192 } 1193 1194 # initialize a hash of amino acid values 1195 sub init_aas { 1196 1197 my $aas = {}; 1198 1199 foreach my $aa ( @emboss_aas ) 1200 { 1201 $aas->{$aa} = 0; 1202 } 1203 1204 return $aas; 1205 } 1206 1207 # output a hash of amino acid values according to the given order 1208 sub out_aas { 1209 my $aas = shift( @_ ); 1210 local *FH = shift( @_ ); 1211 my @order = @_; 1212 1213 foreach my $aa ( @order ) 1214 { 1215 print FH $aa . "\t" . $aas->{$aa} . "\n"; 1216 } 1217 } 1218 1219 # print amino acids and their values to the screen according 1220 # to the given order 1221 sub print_aas { 1222 my $aas = shift( @_ ); 1223 my @order = @_; 1224 1225 if( !@order ) 1226 { 1227 print STDERR "WARNING: NO ORDER GIVEN TO PRINT_AAS!\n"; 1228 } 1229 1230 &out_aas( $aas, *STDOUT, @order ); 1231 } 1232 1233 # write amino acids and their values to the given filename 1234 # depends on the aas listed in order 1235 sub write_aas_limited { 1236 my $aas = shift( @_ ); 1237 my $filename = shift( @_ ); 1238 my @order = @_; 1239 1240 if( ! open( AAS, "> $filename" ) ) 1241 { 1242 print STDERR "write_aas could not open \"$filename\" for writing!\n"; 1243 return 0; 1244 } 1245 1246 &out_aas( $aas, *AAS, @order ); 1247 1248 close( AAS ); 1249 } 1250 1251 # writes the EMBOSS aas and their values to the given filename 1252 sub write_aas { 1253 my $aas = shift( @_ ); 1254 my $filename = shift( @_ ); 1255 1256 if( ! open( AAS, "> $filename" ) ) 1257 { 1258 print STDERR "write_aas could not open \"$filename\" for writing!\n"; 1259 return 0; 1260 } 1261 1262 &out_aas( $aas, *AAS, @emboss_aas ); 1263 1264 close( AAS ); 1265 } 1266 1267 ################################################################################ 1268 # MATRIX MATH AND MANIPULATION 1269 ################################################################################ 1270 1271 #takes in two hash references -- a - b = diff 1272 #returns a hash reference 1273 sub diff_matrix { 1274 my( $matrix_a, $matrix_b ) = @_; 1275 my $matrix_diff = &init_matrix(); 1276 1277 #only use actual aas 1278 foreach my $aa1 ( @aas ) 1279 { 1280 foreach my $aa2 ( @aas ) 1281 { 1282 $matrix_diff->{$aa1}{$aa2} = 1283 $matrix_a->{$aa1}{$aa2} - $matrix_b->{$aa1}{$aa2}; 1284 1285 #print "diff $aa1:$aa2 = $matrix_diff->{$aa1}{$aa2} ($matrix_a->{$aa1}{$aa2} - $matrix_b->{$aa1}{$aa2})\n"; 1286 } 1287 } 1288 1289 #print "DIFF_MATRIX TEST: \n"; 1290 #&print_square_matrix( $matrix_diff, @aas ); 1291 1292 return $matrix_diff; 1293 } 1294 1295 # returns the absolute difference of values between 1296 # matrix a and b 1297 sub abs_diff_matrix { 1298 my( $matrix_a, $matrix_b ) = @_; 1299 1300 my $abs_diff = 0; 1301 1302 # only use actual amino acids 1303 foreach my $aa1 ( @aas ) { 1304 foreach my $aa2 ( @aas ) { 1305 next if( $aa2 lt $aa1 ); 1306 1307 $abs_diff += abs( $matrix_a->{$aa1}{$aa2} - $matrix_b->{$aa1}{$aa2} ); 1308 }} 1309 1310 return $abs_diff; 1311 } 1312 1313 #takes in a hash ref, amino acid 1, amino acid 2, and numerical value 1314 #sets top only by ALPHABETICAL ORDER 1315 sub set_matrix_value { 1316 my( $matrix, $aa1, $aa2, $value ) = @_; 1317 1318 if( $aa1 gt $aa2 ) 1319 { 1320 my $switch = $aa2; 1321 $aa2 = $aa1; 1322 $aa1 = $switch; 1323 } 1324 1325 $matrix->{$aa1}{$aa2} = $value; 1326 } 1327 1328 #takes in a hash ref, amino acid 1, amino acid 2 1329 #returns numerical value 1330 #from top only by ALPHABETICAL ORDER 1331 sub get_matrix_value { 1332 my( $matrix, $aa1, $aa2 ) = @_; 1333 1334 if( $aa1 gt $aa2 ) 1335 { 1336 my $switch = $aa2; 1337 $aa2 = $aa1; 1338 $aa1 = $switch; 1339 } 1340 1341 #print "Getting $matrix->{$aa1}{$aa2} from $aa1:$aa2\n"; 1342 1343 return $matrix->{$aa1}{$aa2}; 1344 } 1345 1346 #initializes all amino acid pairs to 0 for all given hash refs 1347 sub init_matrix { 1348 my %matrix; 1349 1350 foreach my $aa1 ( @emboss_aas ) 1351 { 1352 foreach my $aa2 ( @emboss_aas ) 1353 { 1354 $matrix{$aa1}{$aa2} = 0; 1355 } 1356 } 1357 1358 return \%matrix; 1359 } 1360 1361 #copies the given matrix and returns a reference to the copy 1362 sub copy_matrix { 1363 my $orig = shift( @_ ); 1364 my $copy = &init_matrix(); 1365 1366 foreach my $aa1 ( @sorted_aas ) 1367 { 1368 foreach my $aa2 ( @sorted_aas ) 1369 { 1370 $copy->{$aa1}{$aa2} = $orig->{$aa1}{$aa2}; 1371 } 1372 } 1373 1374 return $copy; 1375 } 1376 1377 # makes a copy of a hash reference of aas 1378 sub copy_aas { 1379 my $orig = shift( @_ ); 1380 my $copy = &init_matrix(); 1381 1382 foreach my $aa ( keys(%$orig) ) 1383 { 1384 $copy->{$aa} = $orig->{$aa}; 1385 } 1386 1387 return $copy; 1388 } 1389 1390 # finds the minimum value in the matrix 1391 sub matrix_min { 1392 1393 my $matrix = shift( @_ ); 1394 1395 my $min = $matrix->{$aas[0]}{$aas[0]}; 1396 1397 foreach my $aa1 ( @emboss_aas ) 1398 { 1399 foreach my $aa2 ( @emboss_aas ) 1400 { 1401 if( $matrix->{$aa1}{$aa2} < $min ) 1402 { 1403 $min = $matrix->{$aa1}{$aa2}; 1404 } 1405 } 1406 } 1407 1408 return $min; 1409 } 1410 1411 # finds the maximum value of the matrix 1412 sub matrix_max { 1413 1414 my $matrix = shift( @_ ); 1415 1416 my $max = $matrix->{$aas[0]}{$aas[0]}; 1417 1418 foreach my $aa1 ( @emboss_aas ) 1419 { 1420 foreach my $aa2 ( @emboss_aas ) 1421 { 1422 if( $matrix->{$aa1}{$aa2} > $max ) 1423 { 1424 $max = $matrix->{$aa1}{$aa2}; 1425 } 1426 } 1427 } 1428 1429 return $max; 1430 } 1431 1432 # merges two matrices, taking the min value for each position 1433 sub min_merge 1434 { 1435 my( $a, $b ) = @_; 1436 my $c = &init_matrix(); 1437 1438 #print "MERGING MIN\n"; 1439 1440 foreach my $aa1 ( @emboss_aas ) 1441 { 1442 foreach my $aa2 ( @emboss_aas ) 1443 { 1444 if( $a->{$aa1}{$aa2} > $b->{$aa1}{$aa2} ) 1445 { 1446 #print "MIN:\t".$a->{$aa1}{$aa2}.">*".$b->{$aa1}{$aa2}."\n"; 1447 $c->{$aa1}{$aa2} = $b->{$aa1}{$aa2}; 1448 } 1449 else 1450 { 1451 #print "MIN:\t".$a->{$aa1}{$aa2}."*<".$b->{$aa1}{$aa2}."\n"; 1452 $c->{$aa1}{$aa2} = $a->{$aa1}{$aa2}; 1453 } 1454 } 1455 } 1456 1457 return $c; 1458 } 1459 1460 # merges two matrices, taking the max value for each position 1461 sub max_merge 1462 { 1463 my( $a, $b ) = @_; 1464 my $c = &init_matrix(); 1465 1466 #print "MERGING MAX\n"; 1467 1468 foreach my $aa1 ( @emboss_aas ) 1469 { 1470 foreach my $aa2 ( @emboss_aas ) 1471 { 1472 if( $a->{$aa1}{$aa2} > $b->{$aa1}{$aa2} ) 1473 { 1474 #print "MAX:\t".$a->{$aa1}{$aa2}."*>".$b->{$aa1}{$aa2}."\n"; 1475 $c->{$aa1}{$aa2} = $a->{$aa1}{$aa2}; 1476 } 1477 else 1478 { 1479 #print "MAX:\t".$a->{$aa1}{$aa2}."<*".$b->{$aa1}{$aa2}."\n"; 1480 $c->{$aa1}{$aa2} = $b->{$aa1}{$aa2}; 1481 } 1482 } 1483 } 1484 1485 return $c; 1486 } 1487 1488 # finds the total of all the values in the matrix 1489 sub matrix_all_total { 1490 1491 my $matrix = shift( @_ ); 1492 my $total = 0; 1493 1494 foreach my $aa1 ( @aas ) 1495 { 1496 foreach my $aa2 ( @aas ) 1497 { 1498 $total += $matrix->{$aa1}{$aa2}; 1499 } 1500 } 1501 1502 return $total; 1503 } 1504 1505 # finds the total of all the values in the upper diagonal 1506 # of the matrix 1507 sub matrix_total { 1508 1509 my $matrix = shift( @_ ); 1510 my $total = 0; 1511 1512 foreach my $aa1 ( @aas ) 1513 { 1514 foreach my $aa2 ( @aas ) 1515 { 1516 next if( $aa1 gt $aa2 ); 1517 1518 if( $aa1 le $aa2 ) 1519 { 1520 $total += $matrix->{$aa1}{$aa2}; 1521 } 1522 } 1523 } 1524 1525 return $total; 1526 } 1527 1528 # finds the total of the absolute values in the matrix summed, 1529 # upper diagonal values only 1530 sub matrix_abs_total { 1531 1532 my $matrix = shift( @_ ); 1533 my $total = 0; 1534 1535 foreach my $aa1 ( @aas ) 1536 { 1537 foreach my $aa2 ( @aas ) 1538 { 1539 next if( $aa1 gt $aa2 ); 1540 1541 if( $aa1 le $aa2 ) 1542 { 1543 $total += abs($matrix->{$aa1}{$aa2}); 1544 } 1545 } 1546 } 1547 1548 return $total; 1549 } 1550 1551 # finds the total of the values on the diagonal of the matrix 1552 sub diag_total { 1553 1554 my $matrix = shift( @_ ); 1555 my $total = 0; 1556 1557 foreach my $aa ( @aas ) 1558 { 1559 $total += $matrix->{$aa}{$aa}; 1560 } 1561 1562 return $total; 1563 } 1564 1565 # Finds the total of the matrix and divides each value 1566 # by that total so the matrix will sum to 1 1567 sub normalize_matrix 1568 { 1569 my( $matrix ) = @_; 1570 my $matrix_n = &init_matrix(); 1571 1572 my $total = &matrix_total( $matrix ); 1573 1574 foreach my $aa1 ( @emboss_aas ) 1575 { 1576 foreach my $aa2 ( @emboss_aas ) 1577 { 1578 if( $aa1 le $aa2 ) 1579 { 1580 $matrix_n->{$aa1}{$aa2} = $matrix->{$aa1}{$aa2} / $total; 1581 } 1582 } 1583 } 1584 1585 return $matrix_n; 1586 } 1587 1588 # sums all the values of a list of aas 1589 sub total_aas 1590 { 1591 my( $aas ) = @_; 1592 my $total = 0; 1593 1594 foreach my $aa (@emboss_aas ) 1595 { 1596 $total += $aas->{$aa}; 1597 } 1598 1599 return $total; 1600 } 1601 1602 # finds the log odds without rounding 1603 sub raw_log_odds 1604 { 1605 my( $transitions, $aas ) = @_; 1606 my $log_odds_matrix = &init_matrix(); 1607 my $C = 2; 1608 1609 #initialize to impossible value 1610 my $lowest_nonzero = 10000000; 1611 1612 #find the lowest 1613 foreach my $aa1 ( @emboss_aas ) 1614 { 1615 foreach my $aa2 ( @emboss_aas ) 1616 { 1617 $lowest_nonzero = $transitions->{$aa1}{$aa2} 1618 if( $transitions->{$aa1}{$aa2} < $lowest_nonzero 1619 && $transitions->{$aa1}{$aa2} != 0 ); 1620 } 1621 } 1622 1623 $lowest_nonzero /= 2; 1624 1625 #all 0s = lowest/2 1626 foreach my $aa1 ( @emboss_aas ) 1627 { 1628 foreach my $aa2 ( @emboss_aas ) 1629 { 1630 if( $aa1 gt $aa2 ) 1631 { 1632 next; 1633 } 1634 1635 if( $transitions->{$aa1}{$aa2} == 0 ) 1636 { 1637 $transitions->{$aa1}{$aa2} = $lowest_nonzero; 1638 } 1639 } 1640 } 1641 1642 #initialize to impossible value 1643 $lowest_nonzero = 1000000; 1644 1645 #find the lowest 1646 foreach my $aa1 ( @emboss_aas ) 1647 { 1648 $lowest_nonzero = $aas->{$aa1} 1649 if( $aas->{$aa1} < $lowest_nonzero 1650 && $aas->{$aa1} != 0 ); 1651 } 1652 1653 $lowest_nonzero /= 2; 1654 1655 #all 0s = lowest/2 1656 foreach my $aa1 ( @emboss_aas ) 1657 { 1658 if( $aas->{$aa1} == 0 ) 1659 { 1660 $aas->{$aa1} = $lowest_nonzero; 1661 } 1662 } 1663 1664 foreach my $aa1 ( @emboss_aas ) 1665 { 1666 foreach my $aa2 ( @emboss_aas ) 1667 { 1668 if( $aa1 gt $aa2 ) 1669 { 1670 next; 1671 } 1672 1673 #if there's a 0 we stay at 0 1674 die "ERROR: $aa1-$aa2 NORMALIZED AT 0 AND WASN'T SET" if( $transitions->{$aa1}{$aa2} == 0 ); 1675 1676 my $random_prob = $aas->{$aa1} * $aas->{$aa2}; 1677 1678 if( !$random_prob ) 1679 { 1680 print "Strange: $aa1 versus $aa2 has normal of ".$transitions->{$aa1}{$aa2}." but no random prob\n"; 1681 next; 1682 } 1683 1684 #actual over expected probabilities of matching up 1685 my $p = $transitions->{$aa1}{$aa2} / ( $random_prob ); 1686 my $log_odds = $C * logn( $p, 2 ); 1687 $log_odds_matrix->{$aa1}{$aa2} = $log_odds; 1688 #print "$aa1 versus $aa2 ( $aas->{$aa1}, $aas->{$aa2}, $transitions->{$aa1}{$aa2} ) log odds: ".$log_odds_matrix->{$aa1}{$aa2}." ( $log_odds )\n"; 1689 } 1690 } 1691 1692 &mirror_matrix( $log_odds_matrix, @emboss_aas ); 1693 1694 return $log_odds_matrix; 1695 } 1696 1697 # log odds rounded to the nearest integer 1698 sub log_odds 1699 { 1700 my( $transitions, $aas ) = @_; 1701 my $log_odds_matrix = &init_matrix(); 1702 my $C = 2; 1703 1704 #initialize to impossible value 1705 my $lowest_nonzero = 10000000; 1706 1707 #find the lowest 1708 foreach my $aa1 ( @emboss_aas ) 1709 { 1710 foreach my $aa2 ( @emboss_aas ) 1711 { 1712 $lowest_nonzero = $transitions->{$aa1}{$aa2} 1713 if( $transitions->{$aa1}{$aa2} < $lowest_nonzero 1714 && $transitions->{$aa1}{$aa2} != 0 ); 1715 } 1716 } 1717 1718 $lowest_nonzero /= 2; 1719 1720 #all 0s = lowest/2 1721 foreach my $aa1 ( @emboss_aas ) 1722 { 1723 foreach my $aa2 ( @emboss_aas ) 1724 { 1725 if( $aa1 gt $aa2 ) 1726 { 1727 next; 1728 } 1729 1730 if( $transitions->{$aa1}{$aa2} == 0 ) 1731 { 1732 $transitions->{$aa1}{$aa2} = $lowest_nonzero; 1733 } 1734 } 1735 } 1736 1737 #initialize to impossible value 1738 $lowest_nonzero = 1000000; 1739 1740 #find the lowest 1741 foreach my $aa1 ( @emboss_aas ) 1742 { 1743 $lowest_nonzero = $aas->{$aa1} 1744 if( $aas->{$aa1} < $lowest_nonzero 1745 && $aas->{$aa1} != 0 ); 1746 } 1747 1748 $lowest_nonzero /= 2; 1749 1750 #all 0s = lowest/2 1751 foreach my $aa1 ( @emboss_aas ) 1752 { 1753 if( $aas->{$aa1} == 0 ) 1754 { 1755 $aas->{$aa1} = $lowest_nonzero; 1756 } 1757 } 1758 1759 foreach my $aa1 ( @emboss_aas ) 1760 { 1761 foreach my $aa2 ( @emboss_aas ) 1762 { 1763 if( $aa1 gt $aa2 ) 1764 { 1765 next; 1766 } 1767 1768 #if there's a 0 we stay at 0 1769 die "ERROR: $aa1-$aa2 NORMALIZED AT 0 AND WASN'T SET" if( $transitions->{$aa1}{$aa2} == 0 ); 1770 1771 my $random_prob = $aas->{$aa1} * $aas->{$aa2}; 1772 1773 if( !$random_prob ) 1774 { 1775 print "Strange: $aa1 versus $aa2 has normal of ".$transitions->{$aa1}{$aa2}." but no random prob\n"; 1776 next; 1777 } 1778 1779 #actual over expected probabilities of matching up 1780 my $p = $transitions->{$aa1}{$aa2} / ( $random_prob ); 1781 my $log_odds = $C * logn( $p, 2 ); 1782 $log_odds_matrix->{$aa1}{$aa2} = &round($log_odds); 1783 #print "$aa1 versus $aa2 ( $aas->{$aa1}, $aas->{$aa2}, $transitions->{$aa1}{$aa2} ) log odds: ".$log_odds_matrix->{$aa1}{$aa2}." ( $log_odds )\n"; 1784 } 1785 } 1786 1787 &mirror_matrix( $log_odds_matrix, @emboss_aas ); 1788 1789 return $log_odds_matrix; 1790 } 1791 1792 # divides all aa values by the total of all values 1793 # so they sum to 1 1794 sub normalize_aas 1795 { 1796 my( $aas ) = @_; 1797 my %norm; 1798 1799 my $total = &total_aas( $aas ); 1800 1801 foreach my $aa (@emboss_aas ) 1802 { 1803 $norm{$aa} = $aas->{$aa} / $total; 1804 } 1805 1806 return \%norm; 1807 } 1808 1809 # calculates the amino acid frequencies in a matrix 1810 sub calc_aa_freq { 1811 my $matrix = shift(@_); 1812 &mirror_matrix($matrix); 1813 &non_diagonal_half($matrix); 1814 my $aa_freq = &init_aas(); 1815 my $matrix_total = &matrix_all_total($matrix); 1816 1817 for my $aa (@aas) 1818 { 1819 $aa_freq->{$aa} = col_sum($matrix, $aa) / $matrix_total; 1820 } 1821 1822 return $aa_freq; 1823 } 1824 1825 # sum two sets of corresponding aa values into another list of aa values 1826 sub sum_aas 1827 { 1828 my( $aas_a, $aas_b ) = @_; 1829 my $c = &init_aas(); 1830 1831 foreach my $aa ( @emboss_aas ) 1832 { 1833 $c->{$aa} = $aas_a->{$aa} + $aas_b->{$aa}; 1834 } 1835 1836 return $c; 1837 } 1838 1839 # sum a row on a matrix 1840 sub row_sum 1841 { 1842 my( $matrix, $aa1 ) = @_; 1843 my $total = 0; 1844 1845 for my $aa2 (@aas) 1846 { 1847 $total += $matrix->{$aa1}{$aa2}; 1848 } 1849 1850 return $total; 1851 } 1852 1853 # cuts all non diagonal matrix values in half 1854 sub non_diagonal_half { 1855 my( $matrix ) = @_; 1856 1857 for my $aa1 (@aas) { 1858 for my $aa2 (@aas) { 1859 $matrix->{$aa1}{$aa2} /= 2 if( $aa1 ne $aa2 ); 1860 } 1861 } 1862 } 1863 1864 # sums a column on a matrix 1865 sub col_sum 1866 { 1867 my( $matrix, $aa2 ) = @_; 1868 my $total = 0; 1869 1870 #print "Col sum for $aa2: "; 1871 1872 for my $aa1 (@aas) 1873 { 1874 #print "$aa1 ($matrix->{$aa1}{$aa2})\t"; 1875 $total += $matrix->{$aa1}{$aa2}; 1876 } 1877 1878 #print "\n"; 1879 1880 return $total; 1881 } 1882 1883 # returns the difference of corresponding values on two aa lists 1884 sub diff_aas 1885 { 1886 my( $aas_a, $aas_b ) = @_; 1887 my $c = &init_aas(); 1888 1889 foreach my $aa ( @emboss_aas ) 1890 { 1891 $c->{$aa} = $aas_a->{$aa} - $aas_b->{$aa}; 1892 } 1893 1894 return $c; 1895 } 1896 1897 #takes in a hash reference and mirrors it based on the input of aa order 1898 sub mirror_matrix { 1899 my $matrix = shift( @_ ); 1900 my @order = @emboss_aas; 1901 1902 if( !@order ) 1903 { 1904 print STDERR "WARNING: NO ORDER GIVEN TO MIRROR_MATRIX!\n"; 1905 } 1906 1907 foreach my $aa1 ( @order ) 1908 { 1909 foreach my $aa2 ( @order ) 1910 { 1911 if( $aa1 gt $aa2 ) 1912 { 1913 if( ! $matrix->{$aa1}{$aa2} ) 1914 { 1915 $matrix->{$aa1}{$aa2} = $matrix->{$aa2}{$aa1}; 1916 } 1917 elsif( ! $matrix->{$aa2}{$aa1} ) 1918 { 1919 $matrix->{$aa2}{$aa1} = $matrix->{$aa1}{$aa2}; 1920 } 1921 } 1922 } 1923 } 1924 } 1925 1926 #removes the lower half of the matrix to make it a top triangular matrix 1927 #requires an input of the order in which to cut! 1928 sub top_matrix { 1929 my $matrix = shift( @_ ); 1930 my @order = @_; 1931 1932 if( !@order ) 1933 { 1934 print STDERR "WARNING: NO ORDER GIVEN TO TOP_MATRIX!\n"; 1935 } 1936 1937 foreach my $aa1 ( @order ) 1938 { 1939 foreach my $aa2 ( @order ) 1940 { 1941 if( $aa1 gt $aa2 ) 1942 { 1943 $matrix->{$aa1}{$aa2} = 0 ; 1944 } 1945 } 1946 } 1947 } 1948 1949 #scales a matrix -- multiplies each member by the given scale 1950 sub scale_matrix { 1951 my $matrix = shift( @_ ); 1952 my $scale = shift( @_ ); 1953 1954 foreach my $aa1 ( @emboss_aas ) 1955 { 1956 foreach my $aa2 ( @emboss_aas ) 1957 { 1958 $matrix->{$aa1}{$aa2} = $matrix->{$aa1}{$aa2} * $scale; 1959 } 1960 } 1961 } 1962 1963 #scales aas -- multiplies each member by the given scale 1964 sub scale_aas { 1965 my $aas = shift( @_ ); 1966 my $scale = shift( @_ ); 1967 1968 foreach my $aa ( @emboss_aas ) 1969 { 1970 $aas->{$aa} = $aas->{$aa} * $scale; 1971 } 1972 } 1973 1974 #multiplies matrix a against matrix b with a given @order 1975 sub mult_matrix { 1976 my $a = shift( @_ ); 1977 my $b = shift( @_ ); 1978 my @order = @_; 1979 my $c = &init_matrix(); 1980 1981 if( !@order ) 1982 { 1983 print STDERR "WARNING: NO ORDER GIVEN TO MULT_MATRIX!\n"; 1984 } 1985 1986 foreach my $i ( 0 .. $#order ) 1987 { 1988 foreach my $j ( 0 .. $#order ) 1989 { 1990 my $total = 0; 1991 1992 foreach my $x ( 0 .. $#order ) 1993 { 1994 $total += $a->{$order[$i]}{$order[$x]} * 1995 $b->{$order[$x]}{$order[$j]}; 1996 } 1997 1998 $c->{$order[$i]}{$order[$j]} = $total; 1999 } 2000 } 2001 2002 return $c; 2003 } 2004 2005 # adds two matrices together 2006 sub sum_matrix { 2007 my $a = shift( @_ ); 2008 my $b = shift( @_ ); 2009 my @order = @_; 2010 my $c = &init_matrix(); 2011 2012 if( !@order ) 2013 { 2014 print STDERR "WARNING: NO ORDER GIVEN TO SUM MATRIX!\n"; 2015 } 2016 2017 foreach my $aa1 ( @order ) 2018 { 2019 foreach my $aa2 ( @order ) 2020 { 2021 $c->{$aa1}{$aa2} = $a->{$aa1}{$aa2} + $b->{$aa1}{$aa2} if( exists( $c->{$aa1}{$aa2})); 2022 } 2023 } 2024 2025 return $c; 2026 } 2027 2028 # transposes the matrix -- ie Mab = Mba 2029 sub transpose_matrix { 2030 my $matrix = shift( @_ ); 2031 my $matrix_t = &init_matrix(); 2032 2033 foreach my $aa1 ( @sorted_aas ) 2034 { 2035 foreach my $aa2 ( @sorted_aas ) 2036 { 2037 $matrix_t->{$aa1}{$aa2} = $matrix->{$aa2}{$aa1}; 2038 } 2039 } 2040 2041 return $matrix_t; 2042 } 2043 2044 # creates an identity matrix -- all values 0 except on the diagonal 2045 sub identity_matrix { 2046 my $matrix = &init_matrix(); 2047 2048 foreach my $aa ( @sorted_aas ) 2049 { 2050 #print "$aa:$aa = 1\n"; 2051 $matrix->{$aa}{$aa} = 1; 2052 } 2053 2054 return $matrix; 2055 } 2056 2057 # calculates the log odds of the matrix, also puts in values for special 2058 # aas BXZ*; takes X values from the given xmatrix file 2059 sub scale_log_odds { 2060 my( $normalized_matrix, $aa_percent, $xmatrix_filename ) = @_; 2061 2062 my $C = 2; 2063 2064 #initialize to impossible value 2065 my $lowest_nonzero = 1000000; 2066 2067 #find the lowest 2068 foreach my $aa1 ( @emboss_aas ) 2069 { 2070 foreach my $aa2 ( @emboss_aas ) 2071 { 2072 my $norm = $normalized_matrix->{$aa1}{$aa2}; 2073 2074 if( defined($norm) && 2075 $norm < $lowest_nonzero 2076 && $norm > 0) { 2077 $lowest_nonzero = $norm; 2078 } 2079 } 2080 } 2081 2082 $lowest_nonzero /= 2; 2083 2084 #all 0s = lowest/2 2085 foreach my $aa1 ( @emboss_aas ) 2086 { 2087 foreach my $aa2 ( @emboss_aas ) 2088 { 2089 if( $aa1 gt $aa2 ) 2090 { 2091 next; 2092 } 2093 2094 if( $normalized_matrix->{$aa1}{$aa2} == 0 ) 2095 { 2096 $normalized_matrix->{$aa1}{$aa2} = $lowest_nonzero; 2097 } 2098 } 2099 } 2100 2101 #initialize to impossible value 2102 $lowest_nonzero = 1000000; 2103 2104 #find the lowest 2105 foreach my $aa1 ( @emboss_aas ) 2106 { 2107 $lowest_nonzero = $aa_percent->{$aa1} 2108 if( exists($aa_percent->{$aa1}) && $aa_percent->{$aa1} < $lowest_nonzero 2109 && $aa_percent->{$aa1} != 0 ); 2110 } 2111 2112 $lowest_nonzero /= 2; 2113 2114 #all 0s = lowest/2 2115 foreach my $aa1 ( @emboss_aas ) 2116 { 2117 if( ! exists($aa_percent->{$aa1}) || $aa_percent->{$aa1} == 0 ) 2118 { 2119 $aa_percent->{$aa1} = $lowest_nonzero; 2120 } 2121 } 2122 2123 my $log_odds_matrix = &init_matrix(); 2124 my $log_min = 0; 2125 2126 foreach my $aa1 ( @emboss_aas ) 2127 { 2128 foreach my $aa2 ( @emboss_aas ) 2129 { 2130 if( $aa1 gt $aa2 ) 2131 { 2132 next; 2133 } 2134 2135 if($aa1 eq "*" || $aa2 eq "*") 2136 { 2137 next; 2138 } 2139 2140 #if there's a 0 we stay at 0 2141 die "ERROR: $aa1:$aa2 NORMALIZED AT 0 AND WASN'T SET" if( $normalized_matrix->{$aa1}{$aa2} == 0 ); 2142 2143 my $random_prob = $aa_percent->{$aa1} * $aa_percent->{$aa2}; 2144 2145 if( !$random_prob ) 2146 { 2147 print "Strange: $aa1 versus $aa2 has normal of ".$normalized_matrix->{$aa1}{$aa2}." but no random prob\n"; 2148 next; 2149 } 2150 2151 #actual over expected probabilities of matching up 2152 my $p = $normalized_matrix->{$aa1}{$aa2} / ( $random_prob ); 2153 $log_odds_matrix->{$aa1}{$aa2} = $C * logn( $p, 2 ); 2154 #print "$aa1 versus $aa2 log odds: ".$log_odds_matrix{$aa1}{$aa2}."\n"; 2155 $log_min = $log_odds_matrix->{$aa1}{$aa2} if($log_odds_matrix->{$aa1}{$aa2} < $log_min); 2156 } 2157 } 2158 2159 #print "LOG ODDS: \n"; 2160 #&print_emboss_matrix( $log_odds_matrix ); 2161 &mirror_matrix( $log_odds_matrix, @emboss_aas, "*" ); 2162 2163 my %special_cases; 2164 $special_cases{"B"} = ["N","D"]; 2165 $special_cases{"Z"} = ["Q","E"]; 2166 $special_cases{"X"} = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', "B", "Z"]; 2167 2168 foreach my $special_aa (keys(%special_cases)) 2169 { 2170 my $components = $special_cases{$special_aa}; 2171 my @components = @$components; 2172 2173 foreach my $aa (@emboss_aas) 2174 { 2175 my $total = 0; 2176 my $components = 0; 2177 2178 foreach my $component (@components) 2179 { 2180 my $value = &get_matrix_value($log_odds_matrix, $aa, $component); 2181 $total += $value; 2182 #print "Adding $value from $aa:$component ($special_aa)...total = $total\n"; 2183 $components++; 2184 } 2185 2186 my $average = $total / $components; 2187 2188 #print "Average of $special_aa:$aa: $average\n"; 2189 2190 &set_matrix_value($log_odds_matrix, $special_aa, $aa, $average); 2191 } 2192 } 2193 2194 #PRINT LOG ODDS MATRIX 2195 #open( MATRIX, "> $matrix_file" ) || die "Could not open matrix file!"; 2196 2197 #print "Log min: $log_min\n"; 2198 2199 # set the * to the log odds minimum except for *:* which is 1 2200 foreach my $aa ( @emboss_aas ) 2201 { 2202 if($aa eq "*") 2203 { 2204 $log_odds_matrix->{$aa}{"*"} = 1; 2205 } 2206 else 2207 { 2208 $log_odds_matrix->{$aa}{"*"} = $log_min; 2209 $log_odds_matrix->{"*"}{$aa} = $log_min; 2210 } 2211 } 2212 2213 my $nn_value = &get_matrix_value($log_odds_matrix, "N", "N"); 2214 my $dd_value = &get_matrix_value($log_odds_matrix, "D", "D"); 2215 2216 if( $nn_value < $dd_value ) 2217 { 2218 #print "Replacing B:B with the N:N ($nn_value) value over D:D ($dd_value)...\n"; 2219 &set_matrix_value( $log_odds_matrix, "N", "N", $nn_value ); 2220 } 2221 else 2222 { 2223 #print "Replacing B:B with the D:D ($dd_value) value over N:N ($nn_value)...\n"; 2224 &set_matrix_value( $log_odds_matrix, "D", "D", $dd_value ); 2225 } 2226 2227 my $qq_value = &get_matrix_value($log_odds_matrix, "Q", "Q"); 2228 my $ee_value = &get_matrix_value($log_odds_matrix, "E", "E"); 2229 2230 if( $qq_value < $ee_value ) 2231 { 2232 #print "Replacing Z:Z with the Q:Q ($qq_value) value over E:E ($ee_value)...\n"; 2233 &set_matrix_value( $log_odds_matrix, "Q", "Q", $nn_value ); 2234 } 2235 else 2236 { 2237 #print "Replacing Z:Z with the E:E ($ee_value) value over Q:Q ($qq_value)...\n"; 2238 &set_matrix_value( $log_odds_matrix, "E", "E", $dd_value ); 2239 } 2240 2241 my $rounded_log_odds = &init_matrix(); 2242 2243 foreach my $aa1 ( @emboss_aas, "*" ) 2244 { 2245 foreach my $aa2 ( @emboss_aas, "*" ) 2246 { 2247 #keep these just to see what the numbers are 2248 #next if( $aa1 eq "-" || $aa2 eq "-" ); 2249 #if( $aa2 ge $aa1 ) 2250 #{ 2251 #print MATRIX "$aa1\t$aa2\t".$log_odds_matrix{$aa1}{$aa2}."\t".&round($log_odds_matrix{$aa1}{$aa2})."\n"; 2252 #} 2253 2254 next if( $aa1 gt $aa2 ); 2255 2256 if( $log_odds_matrix->{$aa1}{$aa2} ) 2257 { 2258 $rounded_log_odds->{$aa1}{$aa2} = &round($log_odds_matrix->{$aa1}{$aa2}); 2259 } 2260 else 2261 { 2262 print "No log odds at $aa1:$aa2\n"; 2263 } 2264 } 2265 } 2266 2267 # replace Xs with the ones from the x matrix 2268 my $x_matrix = &read_emboss_matrix($xmatrix_filename) || die "Could not open X replacement matrix $xmatrix_filename!\n"; 2269 2270 #print "X Replacement Matrix\n"; 2271 #&print_emboss_matrix($x_matrix); 2272 2273 # rounding matrix 2274 foreach my $aa ( @emboss_aas ) 2275 { 2276 #don't mess with *--it should stay the minimum 2277 next if ($aa eq "*"); 2278 2279 my $x_value = &get_matrix_value( $x_matrix, "X", $aa ); 2280 my $old_x = &get_matrix_value( $rounded_log_odds, "X", $aa ); 2281 2282 die "DYING: No replacement X value for X:$aa, old value is $old_x" if( !defined($x_value) ); 2283 2284 #print "Replacing X:$aa = $old_x with $x_value...\n"; 2285 2286 &set_matrix_value( $rounded_log_odds, "X", $aa, $x_value ); 2287 } 2288 2289 $rounded_log_odds->{"*"}{"*"} = 1; 2290 &mirror_matrix( $rounded_log_odds, @emboss_aas, "*" ); 2291 2292 return $rounded_log_odds; 2293 } 2294 1;