/usr/bin/altree-add-S

#!/usr/bin/perl

#Ce programme etiquette les haplotypes malades (ajout d'un G) ou témoins (ajout d'un C), en fonction d'un seuil de malade et témoins qui le porte.
#Le programme prend en entrée un fichier .paup, et redonne un autre .paup

use strict;
use diagnostics;
use warnings;
use Getopt::Long; # qw(:config permute);
use Pod::Usage;
#use Getopt::Std;

our($opt_h,$opt_i, $opt_o, $opt_e, $opt_p, $opt_t, $opt_l, $opt_j, 
    $opt_g, $opt_q);

our $VERSION;
$VERSION = sprintf "0.%03d", q$Revision: 427 $ =~ /(\d+)/g;

sub DefineAncDer {
    my $data_type=shift;
    if ($data_type == 0) {
	my $tem=0;
	my $mal=1;
	return ($tem, $mal);
    } elsif ($data_type == 1) {
	my $tem="C";
	my $mal="G";
	return ($tem, $mal);	
    }
}

sub ReadCorrespond 
{
    my($name_correspond) =shift;
    my $data_qual = shift;
    my($ligne, @tableau);
    my(%correspondance);
    open (CORRESP, '<', $name_correspond) || die "Unable to open file $name_correspond: $!\n";
    while ($ligne=<CORRESP>) {
	chomp($ligne);
	if ($ligne =~ /^$/) {
	    next;
	}

	####### Qualitative data ######
	if ($data_qual eq "quali") {
	    @tableau=split(/\s+/, $ligne);
	    if ($#tableau != 2) {
		die "error in $name_correspond: not 3 columns at line '$ligne'\n";
	    } else {
		$tableau[2]=~ s/c//;
		$tableau[1]=~ s/m//;
		if ($tableau[1] =~ /c/ || $tableau[2] =~ /m/) {
		    die "You have probably exchanged the order of cases and controls in file $name_correspond. It should be: haplo_name m_case_number c_control_number\n";
		}
		$correspondance{$tableau[0]}->{"case"}=$tableau[1]+0;
		$correspondance{$tableau[0]}->{"control"}=$tableau[2]+0;
		
	    }
	} else {
	    ##### Quantitative data #######
	    @tableau = split(/\s+/, $ligne);
	    my @quant_val = splice(@tableau, 1);
	    $correspondance{$tableau[0]}=\@quant_val;
	}
    }
    #my($clefs);
    #DEBUG
   # foreach $clefs (keys %correspondance) {
   # print "$clefs case: ", $correspondance{$clefs}->{"case"}, "\n";
   # 	print "$clefs, control: ",$correspondance{$clefs}->{"control"}, "\n";
   # }
    return(\%correspondance);
}

sub calcul_moyenne_variance
{
    my $correspondance = shift;
    
    my $somme=0;
    my $nb_val=0;
    my $somme_carres=0;
    foreach my $haplo (keys %{$correspondance}) {
	foreach my $elem (@{$correspondance->{$haplo}}) {
	    $nb_val++;
	    $somme+=$elem;
	    $somme_carres+=$elem*$elem;
	}
    }
    my $moyenne = $somme/$nb_val;
    my $variance = $somme_carres/$nb_val-$moyenne*$moyenne;
    return ($moyenne, sqrt($variance));
}

sub calcul_moyenne_nbind
{
    my $tableau = shift;
    
    my $somme = 0;
    my $nb_ind=scalar(@{$tableau});
    foreach my $elem (@{$tableau}) {
	$somme+=$elem;
    }
    return ($somme/$nb_ind, $nb_ind);
}

sub test_quanti_bilateral
{
    my $moy_gen = shift;
    my $ec_type_gen = shift;
    my $moy_loc = shift;
    my $nb_ind_loc = shift;
    my $epsilon = shift;

    if ($moy_loc > ($moy_gen + $epsilon*$ec_type_gen/sqrt($nb_ind_loc))) {
	return "eleve";
    } elsif ($moy_loc < ($moy_gen - $epsilon*$ec_type_gen/sqrt($nb_ind_loc))) {
	return "faible";
    } else {
	return "?";
    }

}

sub travail
{
    
    my($seuil)=shift;
    my($data_type)=shift;
    my($proportion_malades)=shift;
    my $low =shift;
    my $name_correspond=shift;
    my $outgroup = shift;
    my $data_qual = shift;
    my($ligne);
    my($temoin, $malade, $sequence, $nom, $debut, $ancetre);
    my($anc, $num_car, $prop_mal, $prop_tem);
    my(@tableau, @tab2);
    my($tem, $mal)=DefineAncDer($data_type);
    my($correspondance)=ReadCorrespond($name_correspond,$data_qual);

   #foreach my $clefs (keys %{$correspondance}) {
#	print "$clefs case ", $correspondance->{$clefs}->{"case"}, "\n";
#    	print "$clefs, control ",$correspondance->{$clefs}->{"control"}, "\n";
#    }
    my $found_outgroup=0;
    my $ici = 0;
    my $phylo_prog= "Phylip";
    my $compteur=0;
    while ($ligne=<STDIN>){
	chomp($ligne);
	if ($ligne =~/^$/) {
	    next;
	}
	$compteur++;
	my $diese='#';
	if ($ligne =~ /^\s*$diese[N|n]exus\s*$/) {
	    $phylo_prog="PAUP";
	} 	
	if ($phylo_prog eq "PAUP") {
	    if ($ligne =~ /^\s*matrix\s*$/) {
		print $ligne, "\n";
		$ici=1;
		next
	    }
	    if ($ligne =~ /^\s*;\s*$/) {
		
		$ici=0;
	    }
	    if ($ligne =~ /^\s*\[/) {
		next;
	    }
	} else {
	    if ($compteur>1) {
		$ici=1;
	    } 
	}
	    
	if ($ici==0 && $phylo_prog eq "PAUP") {
	    if ($ligne =~ /dimension ntax=([0-9]+)\s+nchar=([0-9]+);/) {
		$num_car=$2+1;
		print "dimension ntax=$1 nchar=", $num_car, ";\n";
	    }elsif ($ligne =~ /format symbols=\"([0-9ATGCU]+)\"/) {
		my($format)=$1;
		my($format_old)=$1;
		$format =~ tr/GC//d;
		$ligne =~ s/$format_old/${format}CG/;
		print $ligne, "\n";
	    } elsif ($ligne =~ /ancstates\s+\*anc\s+vector\s*=\s*([0-9ATCG]+)\s*;/) {
		$anc=$1;
		$anc=$anc."?";
		$ligne =~ s/$1/$anc/;
		print $ligne,"\n";
	    } elsif ($ligne=~ /begin paup;/) {
		print $ligne,"\n";
		print "exclude $num_car; \n";
	    } elsif ($ligne=~ /\s*describetrees/) {
		print "include $num_car;\n";
		print $ligne,"\n";
	    } elsif ($ligne =~ /^\s*([0-9]+)\s+([0-9]+)$/) {
		$num_car=$2+1;
		print "$1\t$num_car\n";
	    } else {
		print $ligne, "\n";
	    }
	} elsif ($ici==0 && $phylo_prog eq "Phylip") {
	    if ($ligne =~ /^\s*([0-9]+)\s+([0-9]+)\s*$/) {
		print $1, "\t", $2+1, "\n";
	    } else {
		die "Strange line $ligne in Phylip file\n";
	    }
	} elsif ($ici==1) {
	    if ($ligne =~ /^\s+$/) {
		next;
	    }  elsif ($ligne =~ /^\s*\[.+\]$/) {
	#	print STDERR "TTTTT\n";
		next;
	    } 	    else {
		@tableau=split(/\s+/, $ligne);
		$sequence=$tableau[1];
		$nom=$tableau[0];
		#DEBUG	print "$nom $outgroup\n";
		if ($nom eq $outgroup){
		    #	$ancetre=$sequence."?";
		    print "$nom  $sequence?\n";
		    $found_outgroup++;
		    next;
		}
		#    } else {
		#if ($debut =~ /^\s*H[0-9]{3}_m[0-9]{3}_c[0-9]{3}/) {
		#   @tab2=split(/_/,$debut);
		# print $tab2[0],"\n";
		#  $temoin=$tab2[2];
		# $temoin =~ s/c//;
		# print STDERR "temoin=$temoin\n";
		# $malade=$tab2[1];
		# $malade =~ s/m//;
		# if ($malade =~ /c/ || $temoin =~ /m/) {
		#	die "You have probably interverti cases and controls in file correspond.txt. It should be: haplo_name m_case_number c_control_number\n";
		#   }
#		if ($debut eq $anc_name){
#		    $found_anc++;
#		}	
		#$nom=$tableau[0];
		#	print "$nom\n";

		###### QUALITATIVE ######
		if ($data_qual eq "quali") {
		    if (not exists ($correspondance->{$nom})){
			print STDERR "$nom is not found in the file $name_correspond. Assuming it is the outgroup.\nThe number of cases and controls affected to this sequence is set to 0\n";
			$correspondance->{$nom}->{"case"}=0;
			$correspondance->{$nom}->{"control"}=0;  
		    }
		    $malade=$correspondance->{$nom}->{"case"};
		    $temoin=$correspondance->{$nom}->{"control"};
		    
		    if ($malade == 0 && $temoin == 0) {
			print STDERR "$nom is carried by 0 cases and 0 controls. The state ? has been attributed to the S character\n";
			$sequence.="?";
			print "$nom  ", $sequence, "\n";
			next;
		    }
		    
		    
		    #	print " $nom mal=$malade\n";   
		    $prop_mal=$malade/($malade+$temoin);
		    $prop_tem=$temoin/($malade+$temoin);
		    #	print "M=$malade T=$temoin\n";
		    #print "test=$test\n";
		    #	if ($test==0) { # test: difference |mal-tem| >=seuil 
		    #    if ($malade > $temoin && $malade-$temoin>=$seuil) {
		    #$sequence.="G";
		    #    } elsif ($malade < $temoin && $temoin-$malade>=$seuil) {
		    #	$sequence.="C";
		    #    } else {
		    #	$sequence.="?";
		    #    }
		    #} elsif ($test==1) {
		    #  if ($malade+$temoin==1) {
		    #$sequence.="?";
		    if ( $malade+$temoin <= $low) {
			$sequence.="?";
		    }  else {
			if ($prop_mal>$proportion_malades+
			    $seuil*sqrt($prop_mal*$prop_tem/($malade+$temoin))) {
			    $sequence.=$mal;
			} elsif ($prop_mal<$proportion_malades-
				 $seuil*sqrt($prop_mal*$prop_tem/($malade+$temoin))) {
			    $sequence.=$tem;
			} else {
			    $sequence.="?";
			}   
			#}
		    }
		    
		    print "$nom  ", $sequence, "\n";
		    #"_m$malade", "_c$temoin\t", $sequence, "\n";
		} else {
		    ###### QUANTITATIVE #######
		    if (not exists ($correspondance->{$nom})){
			print STDERR "$nom is not found in the file". 
			    "$name_correspond\n"; 	
		    }
		    my ($moyenne_gen, $ec_type) = calcul_moyenne_variance
			($correspondance);
		    my ($moyenne_loc, $nb_ind)= calcul_moyenne_nbind
			($correspondance->{$nom});
		    #DEBUG print "$nom\n";
		    #DEBUG print "Moyenne: $moyenne_gen\n";
		    #DEBUG print "Variance: $ec_type\n";
		    #DEBUG print "Moyenne locale: $moyenne_loc\n";
		    #DEBUG print "nb_ind: $nb_ind\n";
		    if ($nb_ind <= $low) {
			$sequence.="?";
		    }  else {
			if (test_quanti_bilateral($moyenne_gen, $ec_type, $moyenne_loc, $nb_ind, $seuil) eq "eleve") {
			    $sequence.=$mal;
			} elsif (test_quanti_bilateral($moyenne_gen, $ec_type, $moyenne_loc, $nb_ind, $seuil) eq "faible") {
			    $sequence.=$tem; 
			} else {
			    $sequence.="?";
			}
		    }
		    print "$nom  ", $sequence, "\n";
		}
	    } 
	}
    }
    #print "anc? $found_anc\n";
   if ($found_outgroup==0 && $outgroup ne "nooutgroup") {
	die "outgroup not found in the file\n";
    } elsif ($found_outgroup ==1 && $outgroup eq "noanc") {
	die " false outgroup found\n";
    } elsif ($found_outgroup > 1) {
	die "Too many outgroups found ($found_outgroup outgroup)";
    }
}

sub usage {
    my $msg =shift;
    my($progname) =shift;
    print STDERR "Error! ".$msg;
    print STDERR "usage :$progname [options]\n";
    print STDERR " Options :\n";
    print STDERR "    [-h]  this help\n";
    print STDERR "     -i   input file\n";
    print STDERR "     -j   input2 file (correspond.txt)\n";
    print STDERR "     -o   output file\n";
    print STDERR "     -t   data type: SNP or DNA\n";
# ancienne option -t test: 0= mal-tem>seuil 1= seuil proportion0+/-sqr(pq/n)]
    print STDERR "     -p   proportion of cases in the whole data set\n";
    print STDERR "     -e   epsilon parameter\n";
    print STDERR "     -q   quantitative or qualitative data\n";
    print STDERR "    [-g]  name of the outgroup\n"; 
    print STDERR "     -l   if an haplotype is present equal or less than -l times, the state of S will be set to ?\n"; 
}

sub main
{
    my($progname);
    my($seuil, $test, $proportion);
    
    my %options= (
    	"first-input-file" => \$opt_i,
	"second-input-file" => \$opt_j,
	"output-file" => \$opt_o,
	"epsilon" => \$opt_e,
	"data-type" => \$opt_t,
	"proportion" => \$opt_p,
	"outgroup" => \$opt_g,
	"low" => \$opt_l,
	"data-qual" => \$opt_q,
	);
    	
    #getopts('ho:i:j:e:t:p:l');
    GetOptions (\%options,
		"version",
		"short-help|h",
		"help",
		"man",
		"first-input-file|i=s",
                "second-input-file|j=s",
                "output-file|o=s",
		"epsilon|e=s",
		"data-type|t=s",
		"proportion|p=s",
		"outgroup|g=s",
		"low|l=i",
		"data-qual|q=s",
		) or pod2usage(2);
    if (defined($options{"version"})) {
	print $0, " version ", $VERSION, "\n";
	print "(Perl version ", $], ")\n";
	exit 0;
    }
    if (defined($options{"short-help"})) {
	pod2usage(-exitstatus => 0, -verbose => 0);
    }
    if (defined($options{"help"})) {
	pod2usage(-exitstatus => 0, -verbose => 1);
    }
    if (defined($options{"man"})) {
	pod2usage(-exitstatus => 0, -verbose => 2);
    }
    
    if ($opt_i) {
	open(STDIN, '<', $opt_i) or die "Impossible to open $opt_i : $!" ;
    }

    my $correspond_name;
    if ($opt_j) {
	$correspond_name=$opt_j;
    } else {
	$correspond_name="correspond.txt"
	}
    if ($opt_o) {
	open(STDOUT, '>', $opt_o) or die "Impossible to open $opt_o : $!" ;
    }
    if ($opt_e) {
	$seuil=$opt_e;
    } else {
	usage("The epsilon parameter is not defined!!\n", $progname);
    } 
    my($data_type);
    if (defined($opt_t)) {
	if ($opt_t =~ /[Dd][Nn][Aa]/) {
	    $data_type=1;
	} elsif ($opt_t =~ /[Ss][Nn][Ps]/) {
	    $data_type=0;
	} else {
	    usage("The data type (SNP or DNA) is missing\n", $progname);
	}
	# if (defined($opt_t)) {
	#	if ($opt_t==0) {
	#	    $test=0;
	#	} elsif ($opt_t==1) {
	#	    $test=1;
	#	} else {
	#	    $test=-999;
	#	    print STDERR "illegal value of opt_t\n";
	#	}
	#  } else {
	#	die "manque le numero du test: 0: mal-tem=seuil  1: seuil=P0+/-sqr(pq/n)!!\n";
	# }    
	
	my($low);
	# Si $low !=0, if only one case or one control, then the state of S is ?
	if (not defined $opt_l) {
	    die "The minimum number of haplotype should be set by -l\n"; 
	} else {
	    $low=$opt_l;
	}
	
	my $data_qual;
	if (!$opt_q) {
	    die "data-qual must be specified\n";
	} elsif ($opt_q eq "qualitative") {
	    $data_qual="quali";
	} elsif ($opt_q eq "quantitative") {
	    $data_qual="quanti";
	} else {
	    die "Invalid value for data-qual (opt_q)\n";
	}
	
	if ($data_qual eq "quali") { 
	    if ($opt_p) {
		$proportion=$opt_p;
	    } else {
		usage("The proportion of cases in the sample is missing!\n", $progname);
	    }
	}
	my $outgroup="nooutgroup";
	if ($opt_g) {
	    $outgroup = $opt_g;
	}    
	travail($seuil, $data_type,  $proportion, $low, $correspond_name, $outgroup, $data_qual);
	
    }
}


main;

__END__
    
=head1 NAME

altree-add-S - Title...

=head1 SYNOPSIS

altree-add-S [options]

 Options:
    --version             program version
    --short-help|h        brief help message
    --help                help message with options descriptions
    --man                 full documentation
    --first-input-file|i  input file 1
    --second-input-file|j input file 2 data concerning the trait (qualitative or quantitative)
    --output-file|o       output file
    --epsilon|e           epsilon value
    --data-type|t         data type: SNP or DNA
    --proportion|p        proportion of cases in the sample (for qualitative data only)
    --data-qual|q         data type: qualitative or quantitative
    [--outgroup|g]        name of the outgroup
    --low|l               if an haplotype is present equal or less than -l times, the state of S will be set to "?"
    

=head1 OPTIONS

=over 8

=item B<--version>

Print the program version and exits.

=item B<--short-help>

Print a brief help message and exits.

=item B<--help>

Print a help message with options descriptions and exits.

=item B<--man>

Prints the manual page and exits.

=item B<--first-input-file|i>

Input file 1 (paup or phylip file)

=item B<--second-input-file|j>

Input file 2, contains the number of times a given haplotypes is carried by  case and control individuals (qualitative data) or the quantitative values correqsponding to a givent haplotype (quantitative data)

=item B<--output-file|o>

Output file

=item B<--epsilon|e>

espilon parameter (see formula in the documentation)

=item B<--data-type|t>  

Type of data: DNA (ATGCU) or SNP (0-1)

=item B<--proportion|p>

Proportion of case individuals in the sample

=item B<--outgroup|g>

Name of the outgroup (if it is not in the file containing the number of cases and controls per haplotype)

=item B<--data-qual|q> 

Specify if the data are qualitative or quantitative

=item B<--low|l>

if an haplotype is present equal or less than -l times, the state of S will be set to "?"

=back

=head1 DESCRIPTION

B<This program> adds a new character (called "character S") to each haplotype in the input file according to the number of cases and controls carrying it.  

=cut
altree 1.3.1-5 / usr / bin / altree-add-S