/usr/bin/bp_nrdb is in bioperl 1.7.2-2.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 | #!/usr/bin/perl
# Author Jason Stajich <jason-at-bioperl-dot-org>
#
# Make a non-redundant database based on sequence (not on ID!)
# This script is still in progress but is intended to mimic what
# Warren Gish's nrdb does
# It requires that Digest::MD5 is installed (for now)
=head1 NAME
bp_nrdb.PLS - a script to emulate Warren Gish's nrdb, make a unique sequence database from a set of input databases
=head1 SYNOPSIS
Usage:
bp_nrdb.PLS [options] file1 file2 file3
Alternative usage
bp_nrdb.PLS -p [options] file1 id1 file2 id2 file3 id3
=head1 DESCRIPTION
This script will create a unique database of sequences
(quasi-nonredundant). The options are:
-o filename - the filename the db is written (STDOUT by default)
-a filename - the filename to append the db to
-l# - minimum required sequence length
-i - do not check for duplicates
-n# - max number of descriptions to report per seq
-d# - delimiter to use between consecutive descriptions
-p - use database id prefixes from command line
=head1 AUTHOR
Jason Stajich, jason-at-bioperl-dot-org
=cut
use strict;
use warnings;
use Bio::SeqIO;
use Getopt::Long;
use Digest::MD5 qw(md5_hex);
my ($output,$append,$min_len,
$no_duplicate_check,$desc_count,
$delimiter, $expect_prefixes,$help);
$delimiter = ';';
GetOptions(
'o|output:s' => \$output,
'a|append:s' => \$append,
'n:s' => \$desc_count,
'l:s' => \$min_len,
'd:s' => \$delimiter,
'p' => \$expect_prefixes,
'i' => \$no_duplicate_check,
'h' => \$help,
);
die("must supply a positive integer for -d") if ( defined $desc_count &&
( $desc_count !~ /^\d+$/ ||
$desc_count < 1) );
die("must supply a positive integer for -l") if ( defined $min_len &&
( $min_len !~ /^\d+$/ ||
$min_len < 1) );
my @files;
if( $help || ! @ARGV ) {
exec('perldoc',$0);
exit(0);
}
while( @ARGV ) {
my ($file, $id) = (undef,'');
if( $expect_prefixes ) {
($file,$id) = (shift @ARGV, shift @ARGV);
if( ! $id ) {
die("Must provide 'name id' pairing of dbfile and id");
}
} else {
$file = shift @ARGV;
}
push @files, [ $file,$id];
}
my $out;
if( $append ) {
$out = new Bio::SeqIO(-file => ">>$append");
} elsif( $output ) {
$out = new Bio::SeqIO(-file => ">$output");
} else {
$out = new Bio::SeqIO(); # use STDOUT
}
my %unique;
my %seqcount;
my $counter = 0;
foreach my $pair ( @files ) {
my ($file,$id) = @$pair;
my $in = new Bio::SeqIO(-file => $file);
while( my $seq = $in->next_seq ) {
next if defined $min_len && $seq->length < $min_len;
if( $id ) {
$seq->display_id("$id:".$seq->display_id);
}
my $s = lc($seq->seq());
my $md5sum = md5_hex($s);
if( $no_duplicate_check ) {
$md5sum = $counter++;
}
if( defined $unique{$md5sum} ) {
$seqcount{$md5sum}++;
next if defined $desc_count && $seqcount{$md5sum++} > $desc_count;
my $desc = $unique{$md5sum}->description;
my $id2 = sprintf("%s %s:%s %s",$delimiter,
$id,$seq->display_id,$seq->description);
$unique{$md5sum}->desc($desc . $id2);
} else {
$unique{$md5sum} = $seq;
}
}
}
foreach my $seq ( values %unique ) {
$out->write_seq($seq);
}
__END__
|