#!/usr/bin/perl -w ############################################################################# # # # copyright : (C) 2002 by David J. Studholme # # email : djs@sanger.ac.uk # # # ############################################################################# ############################################################################# #* # #* This program is free software; you can redistribute it and/or modify # #* it under the terms of the GNU General Public License as published by # #* the Free Software Foundation; either version 2 of the License, or # #* (at your option) any later version. # #* # ############################################################################# #use diagnostics; use strict; use Getopt::Std ; sub show_help { print STDERR < Available options : -h : show this help -o : specify the output file (matrix file) -i : specify the input file (alignment) EOF } ### Parse command line arguments my %option ; getopts ( "o:i:" , \%option ) ; if ( $option{h} ) { show_help() ; exit } unless ( $option{"i"} and $option{"o"} ) { show_help() ; exit } ### Name of file containing aligned sites my $alignment_file = $option{"i"} ; ### Name of output file my $outfile = $option{"o"} ; ### Array for storing contents of file my @read_file ; ### Store frequencies of each base at each position my (@A_freq, @C_freq, @G_freq, @T_freq, @X_freq ) ; ### Maximum allowed length of alignment my $max_seq_size = 100 ; ### Length of the alignment my $align_length = 0 ; print "Using alignment file: $alignment_file\n" ; print "Will write to matrix file: $outfile\n" ; ### Initialise frequency arrays with zeros for (my $i = 0 ; $i < $max_seq_size ; $i++ ) { $A_freq[$i] = 0 ; $C_freq[$i] = 0 ; $G_freq[$i] = 0 ; $T_freq[$i] = 0 ; $X_freq[$i] = 0 ; } ; ### Open file containing aligned site sequences open( ALIGNMENT_FILE, "<$alignment_file" ) and print "Opened sequence file: $alignment_file\n" or die "Could not open sequence file: $alignment_file\n" ; ### Load contents of alignment file into array while (my $read_line = ) { push @read_file, $read_line } ### Check that file is a Clustal alignment file my $read_line = shift @read_file ; unless ( $read_line =~ m/^CLUSTAL/ ) { die "Not a Clustal alignment file/n" } ### Process each line of the file my $number_of_sequences = 0 ; while ( $read_line = shift @read_file) { if ( $read_line =~ m/\s+([ACGTacgt]+)\s+/ ) { my $ln = incorporate_sequence($1) ; if ( $ln >> $align_length ) { $align_length = $ln } $number_of_sequences++; } } print_freqs () ; close ALIGNMENT_FILE ; exit ; sub incorporate_sequence { ### Expects a short DNA sequence string as the argument my ( $sequence ) = ( @_ ) ; ### Explode each letter of sequence into an array my @exploded_sequence = split ( '', $sequence) ; ### Print out the data that will be used my $length = @exploded_sequence ; print "Length:$length, sequence:$sequence $read_line" ; print "Exploded sequence:",(@exploded_sequence), "\n" ; ### Cycle through each letter in sequence and add to the tally my $i = 0 ; while ( my $letter = shift ( @exploded_sequence) ) { if ( $letter =~ m/[Aa]/ ) { $A_freq[$i]++ } elsif ( $letter =~ m/[Cc]/ ) { $C_freq[$i]++ } elsif ( $letter =~ m/[Gg]/ ) { $G_freq[$i]++ } elsif ( $letter =~ m/[TtUu]/ ) { $T_freq[$i]++ } elsif ( $letter =~ m/[^AaCcGgTtUu]/ ) { $X_freq[$i]++ } ; $i ++ ; } return ($length) ; } sub print_freqs { ### Print out results print "Number of sequences: $number_of_sequences\n" ; print "Length of alignment: $align_length\n" ; ### Print all values to screen (STDOUT) for ( my $i = 0 ; $i < $align_length ; $i++ ) { print "(A): $A_freq[$i]; " ; print "(C): $C_freq[$i]; " ; print "(G): $G_freq[$i]; " ; print "(T): $T_freq[$i]\n" ; } ; ### Open output file open(OUTFILE, ">$outfile") and print "Opened output file: $outfile\n" or die"Could not open output file: $outfile\n" ; ### Print all values to output file print OUTFILE "$align_length\n" ; print OUTFILE "$number_of_sequences\n" ; print OUTFILE "A: " ; for ( my $i = 0 ; $i < $align_length ; $i++ ) { print OUTFILE "$A_freq[$i] " ; } ; print OUTFILE "\n" ; print OUTFILE "C: " ; for ( my $i = 0 ; $i < $align_length ; $i++ ) { print OUTFILE "$C_freq[$i] " ; } ; print OUTFILE "\n" ; print OUTFILE "G: " ; for ( my $i = 0 ; $i < $align_length ; $i++ ) { print OUTFILE "$G_freq[$i] " ; } ; print OUTFILE "\n" ; print OUTFILE "T: " ; for ( my $i = 0 ; $i < $align_length ; $i++ ) { print OUTFILE "$T_freq[$i] " ; } ; print OUTFILE "\n" ; close OUTFILE ; return (-1) ; } ;