/usr/bin/blast_top_hits is in seer 1.1.2-3.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 | #!/usr/bin/perl -w
use strict;
use warnings;
use Getopt::Long;
use Text::CSV;
# Features
# BLAST and BLAT
# Exact/full query matches only
#
my $clean = 0;
my $tmp_sort = "tmp_blat.psl";
my $help_message = <<HELP;
Usage: blast_top_hits --input blast_out --mode blast <options>
Reduces a blast or blat output file to the top hit for each query sequence
only
Options
Required
--input BLAST output file in -m 6/-outfmt 6 or BLAT .psl file
--mode Either blast or blat
Optional
--full_query The entire query must be reported as a match to appear
in the output
-h, --help Shows this help.
HELP
#****************************************************************************************#
#* Main *#
#****************************************************************************************#
#* gets input parameters
my ($file_in, $mode, $full_query, $help);
GetOptions( "input|f=s" => \$file_in,
"mode=s" => \$mode,
"full_query" => \$full_query,
"help|h" => \$help
) or die($help_message);
# Basic error check on inputs
if (defined($help))
{
print STDERR $help_message;
}
elsif (!defined($file_in) || !-e $file_in)
{
print STDERR "Input file not found\n";
print STDERR $help_message;
}
elsif (!defined($mode))
{
print STDERR "Operation mode must be either blast or blat\n";
print STDERR $help_message;
}
else
{
my (@columns, $qseqid, $qstart, $qend, $length);
if ($mode =~ /blat/i)
{
# psl format
# match mis- rep. N's Q gap Q gap T gap T gap strand Q Q Q Q T T T T block blockSizes qStarts tStarts
# match match count bases count bases name size start end name size start end count
@columns = qw( match mismatch repmatch Ns Qgapcount Qgapbases Tgapcount Tgapbases strand Qname Qsize Qstart Qend Tname Tsize Tstart Tend blockcount blocksizes qstarts tstarts );
$mode = "blat";
$qseqid = "Qname";
$qstart = "Qstart";
$qend = "Qend";
$length = "match";
# Ensure sorted by query, then match score
system("sort -k10,10 -k1,1nr $file_in > $tmp_sort");
$file_in = $tmp_sort;
$clean = 1;
}
elsif ($mode =~ /blast/i)
{
# blast outfmt 6
#qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
@columns = qw( qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore );
$mode = "blast";
$qseqid = "qseqid";
$qstart = "qstart";
$qend = "qend";
$length = "length";
}
else
{
die("Operation mode $mode not supported. Use either blast or blat\n");
}
my $csv = Text::CSV->new ( { binary => 1, sep_char => "\t", auto_diag => 1, eol => $/ } ) # should set binary attribute.
or die "Cannot use CSV: ".Text::CSV->error_diag ();
my %rec;
$csv->bind_columns (\@rec{@columns});
open my $blast_in, "<:encoding(utf8)", "$file_in" or die "$file_in: $!";
my $previous_query = "";
while ($csv->getline($blast_in))
{
# Using speed advice from http://www.perlmonks.org/?node_id=937023
if ($rec{$qseqid} ne $previous_query)
{
if (!$full_query || ((($mode eq "blast" && $rec{$qstart} == 1) || $rec{$qstart} == 0) && $rec{$qend} == $rec{$length}))
{
$csv->print(*STDOUT, [ @rec{@columns} ]);
}
$previous_query = $rec{$qseqid};
}
}
if ($clean)
{
unlink $file_in;
}
}
exit(0);
|