/usr/bin/mipeCheckSanity

#!/usr/bin/perl

#    This library is free software; you can redistribute it and/or
#    modify it under the terms of the GNU Lesser General Public
#    License as published by the Free Software Foundation; either
#    version 2.1 of the License, or (at your option) any later version.
#
#    This library is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#    Lesser General Public License for more details.
#
#    You should have received a copy of the GNU Lesser General Public
#    License along with this library ('COPYING'); if not, write to the Free Software
#    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

use strict;
#use warnings;
use XML::Twig;
use Data::Dumper;

=head1 NAME

mipeCheckSanity.pl - Check sanity of MIPE file, focussing on contents (vs 'form', that can be checked using xmllint)
  based on MIPE version v0.9
  arguments: * mipe_file
             * (optional) list of PCR IDs

=head1 SYNOPSIS

mipe2genotypes.pl your_file.mipe <pcr_id1> <pcr_id2>

=head1 ADDITIONAL INFO

See http://mipe.sourceforge.net

=head1 AUTHOR

Jan Aerts (jan.aerts@bbsrc.ac.uk)

=cut

my %amb_codes = ( M => ['A', 'C']
                , R => ['A', 'G']
		, W => ['A', 'T']
		, S => ['C', 'G']
		, Y => ['C', 'T']
		, K => ['G', 'T']
		, V => ['A', 'C', 'G']
		, H => ['A', 'C', 'T']
		, D => ['A', 'G', 'T']
		, B => ['C', 'G', 'T']
		, N => ['A', 'C', 'G', 'T']
		);


my ( $file, @pcr_ids ) = @ARGV;
if ( not defined $file ) { die "Please provide filename\n" };
my $twig = XML::Twig->new( TwigHandlers => { pcr => \&pcr }
                         , pretty_print => 'indented' );
$twig->parsefile($file);
exit;

sub pcr {
  my ( $twig, $pcr ) = @_;

  my $to_include = 0;
  my $pcr_id = $pcr->first_child('id')->text;
  if ( scalar @pcr_ids > 0 ) {
    $to_include = 0;
    foreach ( @pcr_ids ) {
      if ( $pcr_id =~ /$_/i ) {
        $to_include = 1;
      }
    }
  } else {
    $to_include = 1;
  }
  
  if ( $to_include ) {

    #####Check that there is a USE part
    if ( not defined $pcr->first_child('use') ) {
      print "WARNING\tCHECK01\t", $pcr->first_child('id')->text, "\tNo use part defined\n";
      next;
    }
  
  
    #####Check that snp_id in genotypes actually is defined as a SNP
    {
      my @snps = $pcr->first_child('use')->children('snp');
      my %snps;
      foreach my $snp ( @snps ) {
        $snps{$snp->first_child('id')->text} = 1;
      }
      my @samples = $pcr->first_child('use')->children('sample');
      foreach my $sample ( @samples ) {
        my @sample_snps = $sample->children('genotype');
        foreach my $sample_snp ( @sample_snps ) {
          if ( not defined $snps{$sample_snp->first_child('snp_id')->text} ) {
            print "ERROR\tCHECK02\t", $pcr_id, "\tSample ", $sample->first_child('id')->text, ": genotype SNP ID: ", $sample_snp->first_child('snp_id')->text, " not defined as SNP\n";
          }
        }
      }
    }
    
    #####Check if SBE primer sequence and SNP position are in concordance with piece of use seq
    {
      my $use_seq = $pcr->first_child('use')->first_child('seq')->text;
      my @snps = $pcr->first_child('use')->children('snp');
      foreach my $snp ( @snps ) {
        my $snp_pos = $snp->first_child('pos')->text;
        my @assays = $snp->children('assay');
        foreach my $assay ( @assays ) {
	  if ( (uc $assay->first_child('type')->text) eq 'SBE' ) {
            my $sbe_specific = $assay->first_child('specific')->text;
            my $sbe_strand = $assay->first_child('strand')->text;
            if ( uc $sbe_strand =~ /^F/ ) {
              if ( not uc $sbe_specific eq uc substr($use_seq, ($snp_pos-(length $sbe_specific)-1), length $sbe_specific) ) {
                print "ERROR\tCHECK03\t", $pcr->first_child('id')->text, "\tSBE_SEQ does not comply with SNP pos and USE SEQ\t", $assay->first_child('id')->text, "\tSBE=", $sbe_specific, "\tUSE=", substr($use_seq, ($snp_pos-(length $sbe_specific)), length $sbe_specific), "\n";
              }
            } else {
              $sbe_specific = reverse $sbe_specific;
              $sbe_specific =~ tr/ACGTacgt/TGCAtgca/;
              if ( not uc $sbe_specific eq uc substr($use_seq, $snp_pos, length $sbe_specific) ) {
                print "ERROR\tCHECK03\t", $pcr->first_child('id')->text, "\tSBE_SEQ does not comply with SNP pos and USE SEQ\t", $assay->first_child('id')->text, "\tSBE=", $sbe_specific, "\tUSE=", substr($use_seq, $snp_pos, length $sbe_specific), "\n";
              }
            }
          }
	}
      }
    }
    
    #####Check if PCR primers can be found back in DESIGN sequence
    {
      my $design = $pcr->first_child('design');
      my $primer1_seq = uc $design->first_child('primer1')->first_child('seq')->text;
      ( my $primer2_seq = reverse uc $design->first_child('primer2')->first_child('seq')->text ) =~ tr/ACGTacgt/TGCAtgca/;
      my $design_seq = uc $design->first_child('seq')->text;
      if ( not $design_seq =~ /$primer1_seq/ ) {
        print "ERROR\tCHECK04\t", $pcr->first_child('id')->text, "\tPRIMER1_SEQ cannot be found back in DESIGN_SEQ\t", $primer1_seq, "\n";
      }
      if ( not $design_seq =~ /$primer2_seq/ ) {
        print "ERROR\tCHECK04\t", $pcr->first_child('id')->text, "\tPRIMER2_SEQ cannot be found back in DESIGN_SEQ\t", $primer2_seq, "\n";
      }
    }
        
    #####Check if SNP ambiguity code is in concordance with USE sequence
    {
      my $use_seq = $pcr->first_child('use')->first_child('seq')->text;
      my @snps = $pcr->first_child('use')->children('snp');
      foreach my $snp ( @snps ) {
        if ( not defined $snp->first_child('amb') ) { next };
        my $snp_id = $snp->first_child('id')->text;
	my $snp_pos = $snp->first_child('pos')->text;
	my $snp_amb = $snp->first_child('amb')->text;
        my $use_allele = substr($use_seq, $snp_pos-1, 1);
        my $good = 0;
	if ( uc $use_allele eq uc $snp_amb ) {
	  $good = 1;
	}
	if ( uc $use_allele eq 'N' ) {
	  $good = 1;
	}
	foreach my $amb_allele ( @{$amb_codes{$snp_amb}} ) {
	  if ( uc $amb_allele eq uc $use_allele ) {
	    $good = 1;
	  }
	}
	if ( not $good ) {
	  print "ERROR\tCHECK05\t", $pcr->first_child('id')->text, "\t", $snp->first_child('id')->text, "\tSNP ambiguity code (=", $snp_amb, ") does not comply with allele at SNP position in USE seq (=", $use_allele, ") (check CHECK03)\n";
	}
      }
    }
    
    #####Check for empty elements
    {
      if ( $pcr->first_child('id')->text eq '' ) {
        print "ERROR\tCHECK06\tNo PCR ID\n";
      }

      my @descendants = $pcr->descendants;
      foreach my $descendant ( @descendants ) {
        if ( scalar $descendant->descendants == 0 ) {
          if ( $descendant->text eq '' ) {
	    print "WARNING\tCHECK06\tEmpty element in ", $pcr->first_child('id')->text, "\n";
	  }
	}
      }
    }
  }
}
mipe 1.1-3 / usr / bin / mipeCheckSanity