This file is indexed.

/usr/bin/bp_search2gff is in bioperl 1.6.901-2.

This file is owned by root:root, with mode 0o755.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
#!/usr/bin/perl -w

eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
    if 0; # not running under some shell

# Author:      Jason Stajich <jason-at-bioperl-dot-org>
# Description: Turn SearchIO parseable report(s) into a GFF report
#
=head1 NAME

search2gff - Turn SearchIO parseable reports(s) into a GFF report

=head1 SYNOPSIS

Usage:
  search2gff [-o outputfile] [-f reportformat] [-i inputfilename]  OR file1 file2 ..

=head1 DESCRIPTION

This script will turn a protein Search report (BLASTP, FASTP, SSEARCH, 
AXT, WABA) into a GFF File.

The options are:

   -i infilename      - (optional) inputfilename, will read
                        either ARGV files or from STDIN
   -o filename        - the output filename [default STDOUT]
   -f format          - search result format (blast, fasta,waba,axt)
                        (ssearch is fasta format). default is blast.
   -t/--type seqtype  - if you want to see query or hit information
                        in the GFF report
   -s/--source        - specify the source (will be algorithm name
                        otherwise like BLASTN)
   --method           - the method tag (primary_tag) of the features
                        (default is similarity)
   --scorefunc        - a string or a file that when parsed evaluates
                        to a closure which will be passed a feature
                        object and that returns the score to be printed
   --locfunc          - a string or a file that when parsed evaluates
                        to a closure which will be passed two
                        features, query and hit, and returns the
                        location (Bio::LocationI compliant) for the
                        GFF3 feature created for each HSP; the closure
                        may use the clone_loc() and create_loc()
                        functions for convenience, see their PODs
   --onehsp           - only print the first HSP feature for each hit
   -p/--parent        - the parent to which HSP features should refer
                        if not the name of the hit or query (depending
                        on --type)
   --target/--notarget - whether to always add the Target tag or not
   -h                 - this help menu
   --version          - GFF version to use (put a 3 here to use gff 3)
   --component        - generate GFF component fields (chromosome)
   -m/--match         - generate a 'match' line which is a container
                        of all the similarity HSPs
   --addid            - add ID tag in the absence of --match
   -c/--cutoff        - specify an evalue cutoff

Additionally specify the filenames you want to process on the
command-line.  If no files are specified then STDIN input is assumed.
You specify this by doing: search2gff E<lt> file1 file2 file3

=head1 AUTHOR

Jason Stajich, jason-at-bioperl-dot-org

=head1 Contributors

Hilmar Lapp, hlapp-at-gmx-dot-net

=cut

use strict;
use Bio::Tools::GFF;
use Getopt::Long;
use Bio::SearchIO;
use Bio::Location::Simple; # pre-declare to simplify $locfunc implementations
use Bio::Location::Atomic; # pre-declare to simplify $locfunc implementations
use Storable qw(dclone);   # for cloning location objects
use Bio::Factory::FTLocationFactory;

my ($output,       # output file (if not stdout)
    $input,        # name of the input file
    $format,       # format of the input file, defauly is blast
    $type,         # 'query' or 'hit' 
    $cutoff,       # cut-off value for e-value filter
    $sourcetag,    # explicit source tag (will be taken from program
                   # otherwise
    $methodtag,    # primary tag (a.k.a. method), default 'similarity'
    $gffver,       # GFF version (dialect) to write  
    $scorefunc,    # closure returning the score for a passed feature
    $locfunc,      # closure returning a location object for a passed
                   # query and hit feature
    $addid,        # flag: whether to always add the ID for $match == 0
    $parent,       # the name of the parent to use; if set and $match == 0
                   # will always add the target
    $comp,         # flag: whether to print a component feature
    $addtarget,    # flag: whether to always add the Target tag, default
                   # is true
    $match,        # flag: whether to print match lines as containers
    $onehsp,       # flag: whether to consider only the first HSP for a hit
    $quiet,        # flag: run quietly
    $help);        # flag: show help screen

# set defaults:
$format    = 'blast'; 
$type      = 'query';
$gffver    = 2;
$methodtag = "similarity";
$addtarget = 1;

GetOptions(
	   'i|input:s'  => \$input,
	   'component'  => \$comp,
	   'm|match'    => \$match,
	   'o|output:s' => \$output,
	   'f|format:s' => \$format,
	   's|source:s' => \$sourcetag,
           'method=s'   => \$methodtag,
           'addid'      => \$addid,
           'scorefunc=s'=> \$scorefunc,
           'locfunc=s'  => \$locfunc,
           'p|parent=s' => \$parent,
           'target!'    => \$addtarget,
           'onehsp'     => \$onehsp,
	   't|type:s'   => \$type,
	   'c|cutoff:s' => \$cutoff,
	   'v|version:i'=> \$gffver,
           'q|quiet'    => \$quiet,
	   'h|help'     => sub{ exec('perldoc',$0);
				exit(0)
				},
	   );
$type = lc($type);
if( $type =~ /target/ ) { $type = 'hit' }
elsif( $type ne 'query' && $type ne 'hit' ) {
    die("seqtype must be either 'query' or 'hit'");
}

# custom or default function returning the score
$scorefunc = defined($scorefunc) ? parse_code($scorefunc) : sub {shift->score};

# custom or default function returning the location
$locfunc = defined($locfunc) ? parse_code($locfunc) : sub { shift->location };

# if --match is given then $addid needs to be disabled
$addid = undef if $addid && $match;

# if no input is provided STDIN will be used
my $parser = new Bio::SearchIO(-format => $format,
			       -verbose => $quiet ? -1 : 0,
			       -file   => $input);

my $out;
if( defined $output ) {
    $out = new Bio::Tools::GFF(-gff_version => $gffver,
			       -file => ">$output");
} else { 
    $out = new Bio::Tools::GFF(-gff_version => $gffver); # STDOUT
}
my (%seen_hit,%seen);
my $other = $type eq 'query' ? 'hit' : 'query';

while( my $result = $parser->next_result ) {
    my $qname = $result->query_name;
    if ( $comp && $type eq 'query' && 
	 $result->query_length ) {
	$out->write_feature(Bio::SeqFeature::Generic->new
			    (-start       => 1,
			     -end         => $result->query_length,
			     -seq_id      => $qname,
			     -source_tag  => 'chromosome',
			     -primary_tag => 'Component',
			     -tag         => {
				 'Sequence' => $qname
				 }));
    }
    while( my $hit = $result->next_hit ) {
	next if( defined $cutoff && $hit->significance > $cutoff);
	my $acc = $qname;
	if( $seen{$qname."-".$hit->name}++ ) {
	    $acc = $qname."-".$seen{$qname.'-'.$hit->name};
	}
	
	if( $comp && $type eq 'hit' && $hit->length &&
	    ! $seen_hit{$hit->name}++ ) {
	    $out->write_feature(Bio::SeqFeature::Generic->new
				(-start       => 1,
				 -end         => $hit->length,
				 -seq_id      => $hit->name,
				 -source_tag  => 'chromosome',
				 -primary_tag => 'Component',
				 -tag         => {
				     'Sequence' => $hit->name
				     }));
	}
	my (%min,%max,$seqid,$name,$st);
	while( my $hsp = $hit->next_hsp ) {
	    my $feature = new Bio::SeqFeature::Generic;
	    my ($proxyfor,$otherf);
	    if( $type eq 'query' ) {
		($proxyfor,$otherf) = ($hsp->query,
				      $hsp->hit);
		$name  ||= $hit->name;
	    } else {
		($otherf,$proxyfor) = ($hsp->query,
				      $hsp->hit);
		$name ||= $acc;
	    }
	    $proxyfor->score($hit->bits) unless( $proxyfor->score );
	    if (($gffver == 3) && ($match || $parent)) {
		$feature->add_tag_value('Parent', $parent || $name);
	    }
	    
	    $min{$type} = $proxyfor->start 
                unless defined $min{$type} && $min{$type} < $proxyfor->start;
	    $max{$type} = $proxyfor->end 
                unless defined $max{$type} && $max{$type} > $proxyfor->end;
	    $min{$other} = $otherf->start 
                unless defined $min{$other} && $min{$other} < $otherf->start;
	    $max{$other} = $otherf->end 
                unless defined $max{$other} && $max{$other} > $otherf->end;
	    if ($addtarget || $match) {
                $feature->add_tag_value('Target', 'Sequence:'.$name);
                $feature->add_tag_value('Target', $otherf->start);
                $feature->add_tag_value('Target', $otherf->end);
            }
            if ($addid) {
                $feature->add_tag_value('ID', $name);
            }

	    $feature->location(&$locfunc($proxyfor,$otherf));
	    #  strand for feature is always going to be product of
	    #  query & hit strands so that target can always be just
	    #  '+'
	    $feature->strand ( $proxyfor->strand * $otherf->strand);
	    if( $sourcetag ) { 
		$feature->source_tag($sourcetag);
	    } else {
		$feature->source_tag($proxyfor->source_tag);
	    }
	    $feature->score(&$scorefunc($proxyfor));
	    $feature->frame($proxyfor->frame);
	    $feature->seq_id($proxyfor->seq_id );
	    $feature->primary_tag($methodtag);
            # add annotation if encoded in the query description
            my $desc = $result->query_description;
            while ($desc =~ /\/([^=]+)=(\S+)/g) {
                $feature->add_tag_value($1,$2);
            }
	    $seqid ||= $proxyfor->seq_id;
	    $out->write_feature($feature);
	    $st ||= $sourcetag || $proxyfor->source_tag;
            last if $onehsp;
	}
	if( $match ) { 
	    
	    my $matchf = Bio::SeqFeature::Generic->new
		(-start => $min{$type},
		 -end   => $max{$type},
		 -strand=> $hit->strand($type)*$hit->strand($other),
		 -primary_tag => 'match',
		 -source_tag  => $st,
		 -score => $hit->bits,
		 -seq_id => $seqid);
	    if( $gffver == 3 ) { 
		$matchf->add_tag_value('ID', $name);
	    }
	    $matchf->add_tag_value('Target', "Sequence:$name");
	    $out->write_feature($matchf);
	}
    }
}

sub parse_code{
    my $src = shift;
    my $code;

    # file or subroutine?
    if(-r $src) {
        if(! (($code = do $src) && (ref($code) eq "CODE"))) {
            die "error in parsing code block $src: $@" if $@;
            die "unable to read file $src: $!" if $!;
            die "failed to run $src, or it failed to return a closure";
        }
    } else {
        $code = eval $src;
        die "error in parsing code block \"$src\": $@" if $@;
        die "\"$src\" fails to return a closure"
            unless ref($code) eq "CODE";
    }
    return $code;
}

=head2 clone_loc

 Title   : clone_loc
 Usage   : my $l = clone_loc($feature->location);
 Function: Helper function to simplify the task of cloning locations
           for --locfunc closures.

           Presently simply implemented using Storable::dclone().
 Example :
 Returns : A L<Bio::LocationI> object of the same type and with the
           same properties as the argument, but physically different.
           All structured properties will be cloned as well.
 Args    : A L<Bio::LocationI> compliant object


=cut

sub clone_loc{
    return dclone(shift);
}

=head2 create_loc

 Title   : create_loc
 Usage   : my $l = create_loc("10..12");
 Function: Helper function to simplify the task of creating locations
           for --locfunc closures. Creates a location from a feature-
           table formatted string.

 Example :
 Returns : A L<Bio::LocationI> object representing the location given
           as formatted string.
 Args    : A GenBank feature-table formatted string.


=cut

sub create_loc{
    return Bio::Factory::FTLocationFactory->from_string(shift);
}