This file is indexed.

/usr/share/augustus/scripts/gtf2gff.pl is in augustus 3.3+dfsg-2build1.

This file is owned by root:root, with mode 0o755.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
#!/usr/bin/perl
#
# format convert a gtf file
#
# Mario Stanke, 1.2.2010, mario.stanke@uni-greifswald.de
use strict;
use Getopt::Long;


my $help = 0;
my $verbose = 0;
my $printExon = 0;
my $printUTR = 0;
my $gff3 = 0;
my $printIntron = 0;
my $outfile;
my $includeStopInCDS = 0;

GetOptions(
    'out=s'=>\$outfile,
    'help!'=>\$help,
    'verbose!'=>\$verbose,
    'printExon!'=>\$printExon,
    'printUTR!'=>\$printUTR,
    'printIntron!'=>\$printIntron,
    'gff3!'=>\$gff3,
    'includeStopInCDS!'=>\$includeStopInCDS);

exec("perldoc $0") if ($help || !defined($outfile));

my @txorder = (); # tx ids in the order of the input file
my %geneOf = ();  # keys tx ids, values: gene ids
my %geneLine =  (); # keys gene ids, values: array refs for the gene GTF line (if exists)
# hash of transcripts
#    keys: transcript ids
#    values: hash reference
#            keys: txstart
#                  txend
#                  codingstart
#                  codingend
#                  strand, chr, source
#                  txline  array of columns if a transcript/mRNA line exists
#                  CDS  array of arrays (lines and columns) for coding parts of exons
#                  UTR  array of arrays for UTR exons
#                  exon array of arrays for complete exons
#                  intron array of arrays for introns
#                  rest array of arrays all other features, like tts,tss start_codon, stop_codon

my %txs = ();

parseAndStoreGTF();
convert();
open (OUT, ">$outfile") or die ("Could not open $outfile for writing.");
printConvertedGTF();
close OUT;

sub parseAndStoreGTF{
    my %seen = ();
    my ($txid, $geneid, $chr, $start, $end, $feature, $strand, $source, $stop_codon);
    foreach my $line (<STDIN>){
	my @f = split /\t/, $line;
	next if (@f<8);
	($chr,$source,$feature,$start,$end,$strand) = ($f[0],$f[1],$f[2],$f[3],$f[4],$f[6]);
	# check whether it is a line with 'gene' feature
	if ($f[2] eq "gene" && ($f[8] =~ /ID=([^;]+)/ || $f[8] =~ /gene_id."?([^";]+)"?/ || $f[8] =~ /^(\S+)$/)){
	    $geneid = $1;
	    $geneLine{$geneid} = \@f;
	    next;
	} 
	# check whether it is a line with 'transcript' feature
	if ($f[2] =~ "(transcript|mRNA)" && ($f[8] =~ /ID=([^;]+)/ || $f[8] =~ /transcript_id."?([^";]+)"?/ || $f[8] =~ /^(\S+)$/)){
	    $txid = $1;
	    $txs{$txid} = {"strand"=>$strand, "chr"=>$chr, "source"=>$source, "CDS"=>[], "UTR"=>[], "exon"=>[], "intron"=>[], "rest"=>[]} if (!exists($txs{$txid}));
	    $txs{$txid}{"txline"} = \@f;
	    if($f[8] =~ /Parent=([^;]+)/ || $f[8] =~ /gene_id."?([^";]+)"?/ || $f[8] =~ /(g\d+)\.t\d+/){
		$geneOf{$txid} = $1;
	    }
	    next;
	}
	
	$txs{$txid}{"CDS"} = [] if (!defined($txs{$txid}{"CDS"}));
	$txs{$txid}{"UTR"} = [] if (!defined($txs{$txid}{"UTR"}));
	$txs{$txid}{"exon"} = [] if (!defined($txs{$txid}{"exon"}));
	$txs{$txid}{"rest"} = [] if (!defined($txs{$txid}{"rest"}));

	# all other lines must belong to a transcript and a gene
	if ($f[8] =~ /(transcript_id|Transcript)."?([^";]+)"?/ ){
	    $txid = $2;
	} else {
	    if($f[8] =~ /Parent=([^;]+)/){
		$txid = $1;
	    }else{
		die ("Neither GTF nor GFF format in the following line:\n$line\ntranscript_id not found.\n");
	    }
	}
	if ($f[8] =~ /gene_id."?([^";]+)"?/){
	    $geneid = $1;
	} else {
	    if($f[8] =~ /Parent=([^;]+)/){
		$geneid = $geneOf{$1};
	    }else{
		die ("Neither GTF nor GFF format in the following line:\n$line\ngene_id not found.\n");
	    }
	}
	$txs{$txid} = {"strand"=>$strand, "chr"=>$chr, "source"=>$source, "CDS"=>[], "UTR"=>[], "exon"=>[], "intron"=>[], "rest"=>[]} if (!exists($txs{$txid}));

	if (!$seen{$txid}){
	    push @txorder, $txid; # remember the input order for transcripts for the output
	    $seen{$txid} = 1;
	}
	# assign parental gene id to tx id
	die ("transcript $txid has conflicting gene parents: and $geneid. Remember: In GTF txids need to be overall unique.")
	    if (defined($geneOf{$txid}) && $geneOf{$txid} ne $geneid);
	
	if ($feature eq "CDS" || $feature eq "coding_exon" || $feature eq "exon" || $feature =~ /UTR/i){
	    $txs{$txid} = {"strand"=>$strand, "chr"=>$chr, "source"=>$source, "CDS"=>[], "UTR"=>[], "exon"=>[], "intron"=>[], "rest"=>[]} if (!exists($txs{$txid}));
	    $txs{$txid}{"txstart"} = $start if (!defined($txs{$txid}{"txstart"}) || $txs{$txid}{"txstart"} > $start);
	    $txs{$txid}{"txend"} = $end if (!defined($txs{$txid}{"txend"}) || $txs{$txid}{"txend"} < $end);
	}
	if ($feature eq "CDS" || $feature eq "coding_exon"){
	    $txs{$txid}{"codingstart"} = $start if (!defined($txs{$txid}{"codingstart"}) || $txs{$txid}{"codingstart"} > $start);
	    $txs{$txid}{"codingend"} = $end if (!defined($txs{$txid}{"codingend"}) || $txs{$txid}{"codingend"} < $end);
	    push @{$txs{$txid}{"CDS"}}, \@f;
	} elsif ($feature =~ /UTR/i){
	    push @{$txs{$txid}{"UTR"}}, \@f;
	} elsif ($feature eq "exon"){
	    push @{$txs{$txid}{"exon"}}, \@f;
	} elsif ($feature eq "intron") {
	    $txs{$txid}{"intron"} = [] if (!defined($txs{$txid}{"intron"}));
            push @{$txs{$txid}{"intron"}}, \@f;
	} else {
	    push @{$txs{$txid}{"rest"}}, \@f;
	}
	if ($feature eq "stop_codon"){
	    $txs{$txid}{"stop_codon"} = $start
	}
    }
}


sub convert{
    my @f;
    foreach my $txid (keys %txs){
	
	# optionally, include stop codon in CDS
	if($includeStopInCDS){
	    my @cdslines = sort {$a->[3] <=> $b->[3] || $a->[4] <=> $b->[4]} @{$txs{$txid}{"CDS"}};
	    if( $txs{$txid}{"strand"} eq '-' && $txs{$txid}{"stop_codon"}+3 == $txs{$txid}{"codingstart"}){
		$cdslines[0]->[3]-=3;
	    }
	    if( $txs{$txid}{"strand"} eq '+' && $txs{$txid}{"stop_codon"} == $txs{$txid}{"codingend"}+1){
		$cdslines[$#cdslines]->[4]+=3;
	    }
	    # TODO: if stop_codon is a separate CDS exon, then insert new CDS exon
	}
	
	# remember whether exons were not in the input file
	my $exonArrayWasEmpty;
	if(@{$txs{$txid}{"exon"}} == 0){
	    $exonArrayWasEmpty = 1;
	}else{
	    $exonArrayWasEmpty = 0;
	}
	# add exon lines if not already present and if desired in output
	if (($printExon || $printIntron) && @{$txs{$txid}{"exon"}} == 0){
	    print "Creating exon lines for $txid\n" if ($verbose);
	    # sort UTR and CDS lines by coordinates
	    my @exonpartlines = sort {$a->[3] <=> $b->[3] || $a->[4] <=> $b->[4]} (@{$txs{$txid}{"CDS"}}, @{$txs{$txid}{"UTR"}});
	    next if (@exonpartlines == 0);
	    @f = @{$exonpartlines[0]};
	    shift @exonpartlines;
	    ($f[2], $f[5], $f[7]) = ("exon", '.', '.'); # score and frame are not defined
	    foreach my $g (@exonpartlines){
		if ($f[4] >= $g->[3]){ # check for non-overlappingness
		    die ("In transcript $txid two UTR/CDS features are overlapping. Not allowed by definition.");
		} elsif ($f[4] + 1 == $g->[3]){ # exactly adjacent
		    # join two UTR/CDS features to one
		    $f[4] = $g->[4];
		} else {
		    # push exon
		    my @ff = @f; # deep copy array
		    push @{$txs{$txid}{"exon"}}, \@ff;
		    @f = @$g;
		    ($f[2], $f[5], $f[7]) = ("exon", '.', '.'); # score and frame are not defined
		}
	    }
	    # push remaining, last exon
	    my @ff = @f;
	    push @{$txs{$txid}{"exon"}}, \@ff;
	}

	# add UTR lines if not already present and if desired in output
	if ($printUTR && @{$txs{$txid}{"UTR"}} == 0){
	    # sort exon and CDS lines start coordinates, "exon" feature comes before CDS feature at tie
	    my @epl = sort { return 1 if ($a->[3] > $b->[3]);
			     return -1 if ($a->[3] < $b->[3]);
			     return 1 if ($a->[2] ne "exon");
			     return -1 if ($b->[2] ne "exon");
			     return 0;} (@{$txs{$txid}{"CDS"}}, @{$txs{$txid}{"exon"}});
	    for (my $i=0; $i<@epl; $i++){
		next if ($epl[$i]->[2] ne "exon");
		if ($i < @epl-1 && $epl[$i+1]->[2] ne "exon" && $epl[$i+1]->[3] <= $epl[$i]->[4]){ # next feature is CDS and overlapping somehow
		    # --------------- exon
		    #    XXXXXXX CDS
		    # left UTR part
		    if ($epl[$i]->[3] < $epl[$i+1]->[3]){
			my @ff = @{$epl[$i]};
			$ff[4] = $epl[$i+1]->[3]-1; # ends 1 before CDS starts
			($ff[5], $ff[7]) = ('.', '.'); # score and frame are not defined
			$ff[2] = ($txs{$txid}{"strand"} eq '+')? "5'-UTR" : "3'-UTR";
			push @{$txs{$txid}{"UTR"}}, \@ff;
		    }
		    # right UTR part
		    if ($epl[$i+1]->[4] < $epl[$i]->[4]){
			my @ff = @{$epl[$i]};
			$ff[3] = $epl[$i+1]->[4]+1; # starts 1 after CDS starts
			($ff[5], $ff[7]) = ('.', '.'); # score and frame are not defined
			$ff[2] = ($txs{$txid}{"strand"} eq '+')? "3'-UTR" : "5'-UTR";
			push @{$txs{$txid}{"UTR"}}, \@ff;
		    }
		} else {
		    my @ff = @{$epl[$i]};
		    ($ff[5], $ff[7]) = ('.', '.'); # score and frame are not defined
		    if (($ff[3] < $txs{$txid}{"codingstart"} && $txs{$txid}{"strand"} eq "+") ||
			($ff[4] > $txs{$txid}{"codingend"} && $txs{$txid}{"strand"} eq "-")){
			$ff[2] = "5'-UTR";
		    } else {
			$ff[2] = "3'-UTR";
		    }
		    push @{$txs{$txid}{"UTR"}}, \@ff;
		}
	    }
	}
	# add intron lines if not already present and if desired in output                                                             
        if ($printIntron && @{$txs{$txid}{"intron"}} == 0){
	    my @exonlines =  sort {$a->[3] <=> $b->[3] || $a->[4] <=> $b->[4]} (@{$txs{$txid}{"exon"}});
	    my @intronline = @{$exonlines[0]};
	    my $start = $intronline[4]+1;
	    my $end;
	    shift @exonlines;
	    foreach my $ex (@exonlines){
		$end = $ex->[3]-1;
		my @il = @intronline;
		($il[2], $il[3], $il[4], $il[5], $il[7]) = ("intron",$start,$end,'.','.');
		push @{$txs{$txid}{"intron"}}, \@il;
		$start = $ex->[4]+1;
	    }   
	    if(!$printExon && $exonArrayWasEmpty){
		@{$txs{$txid}{"exon"}} = ();
		
	    }
	}
    }
}


sub printConvertedGTF {
    my @lines;
    my %seen = ();
    my $geneid;
    foreach my $txid (@txorder){
	# print gene line before the first transcript of this gene
	$geneid = $geneOf{$txid};
	if (!$seen{$geneid} && defined($geneLine{$geneid})){
	    if ($gff3) {
		$geneLine{$geneid}->[8] = "ID=$geneid;\n";
	    }
	    print OUT join ("\t", @{$geneLine{$geneid}});
	    $seen{$geneid} = 1;
	}
	# print transcript line
	if ($txs{$txid}{"txline"}[0] ne ""){
	    if ($gff3) {
		$txs{$txid}{"txline"}->[2] = "mRNA";
		$txs{$txid}{"txline"}->[8] = "ID=$txid;Parent=$geneid\n";
	    }
	    print OUT join ("\t", @{$txs{$txid}{"txline"}});
	}
	# print all other lines
	@lines = sort {$a->[3] <=> $b->[3] || $a->[4] <=> $b->[4]}
	(@{$txs{$txid}{"CDS"}}, @{$txs{$txid}{"UTR"}}, @{$txs{$txid}{"exon"}}, @{$txs{$txid}{"intron"}}, @{$txs{$txid}{"rest"}});
	#[PKRK] following variable are to make the CDS and exon ID unique within the scope of the gene model
	my $ct_exon = 0;
	my $ct_CDS = 0;
	my $ct_3UTR = 0;
	my $ct_5UTR = 0;
	foreach my $line (@lines){
	    if ($gff3){
		$line->[2] =~ s/3.*UTR/three_prime_utr/;
		$line->[2] =~ s/5.*UTR/five_prime_utr/;
		$line->[2] =~ s/tss/transcription_start_site/;
		$line->[2] =~ s/tts/transcription_end_site/;
		if($line->[2] eq "exon"){
		    ++$ct_exon;
		    $line->[8] = "ID=$txid.$line->[2]$ct_exon;Parent=$txid;\n";
		}elsif($line->[2] eq "CDS"){
		    ++$ct_CDS;
		   $line->[8] = "ID=$txid.$line->[2]$ct_CDS;Parent=$txid\n";
		}elsif($line->[2] eq "three_prime_utr"){
		    ++$ct_3UTR;
		    $line->[8] = "ID=$txid.3UTR$ct_3UTR;Parent=$txid\n";
		}elsif($line->[2] eq "five_prime_utr"){
		    ++$ct_5UTR;
		    $line->[8] = "ID=$txid.5UTR$ct_5UTR;Parent=$txid\n";
		} else {
		    $line->[8] = "Parent=$txid;\n";
		}
	    }
	    print OUT join("\t", @$line);
	}
    }
}

__END__

=pod

=head1 NAME

gtf2gff.pl      format convert a GTF file

=head1 SYNOPSIS

gtf2gff.pl <in.gtf --out=out.gff

    Besides easy by-line changes this script can in particular swap between the different representations of UTRs:
    a) explicit UTR lines (e.g. 3'-UTR or three_prime_utr)
    b) implicit specification by 'exon' and 'CDS' features
    
=head1 OPTIONS

  out                 gff output file
  --printExon         print exon features (may include CDS and UTR parts)
  --printUTR          print UTR features
  --printIntron       print intron features
  --gff3              output in gff3 format
  --includeStopInCDS  include stop codon in the CDS

=head1 DESCRIPTION
    
    example input:

    chr1 AUGUSTUS  gene        12656   14013   0.04    +   .   g50
    chr1 AUGUSTUS  transcript  12656   14013   0.04    +   .   g50.t1
    chr1 AUGUSTUS  tss         12656   12656   .       +   .   transcript_id "g50.t1"; gene_id "g50";
    chr1 AUGUSTUS  5'-UTR      12656   12867   0.2     +   .   transcript_id "g50.t1"; gene_id "g50";
    chr1 AUGUSTUS  start_codon 12868   12870   .       +   0   transcript_id "g50.t1"; gene_id "g50";
    chr1 AUGUSTUS  intron      12994   13248   1       +   .   transcript_id "g50.t1"; gene_id "g50";
    chr1 AUGUSTUS  CDS         12868   12993   0.8     +   0   transcript_id "g50.t1"; gene_id "g50";
    chr1 AUGUSTUS  CDS         13249   13479   1       +   0   transcript_id "g50.t1"; gene_id "g50";
    chr1 AUGUSTUS  stop_codon  13477   13479   .       +   0   transcript_id "g50.t1"; gene_id "g50";
    chr1 AUGUSTUS  3'-UTR      13480   14013   0.17    +   .   transcript_id "g50.t1"; gene_id "g50";
    chr1 AUGUSTUS  tts         14013   14013   .       +   .   transcript_id "g50.t1"; gene_id "g50";

    example output for --gff3 --printExon:

    chr1 AUGUSTUS  gene                     12656   14013   0.04    +   .   ID=g50;
    chr1 AUGUSTUS  mRNA                     12656   14013   0.04    +   .   ID=g50.t1;Parent=g50
    chr1 AUGUSTUS  transcription_start_site 12656   12656   .       +   .   Parent=g50.t1;
    chr1 AUGUSTUS  five_prime_utr           12656   12867   0.2     +   .   ID=g50.t1.5UTR1;Parent=g50.t1
    chr1 AUGUSTUS  exon                     12656   12993   .       +   .   ID=g50.t1.exon1;Parent=g50.t1;
    chr1 AUGUSTUS  start_codon              12868   12870   .       +   0   Parent=g50.t1;
    chr1 AUGUSTUS  CDS                      12868   12993   0.8     +   0   ID=g50.t1.CDS1;Parent=g50.t1
    chr1 AUGUSTUS  intron                   12994   13248   1       +   .   Parent=g50.t1;
    chr1 AUGUSTUS  CDS                      13249   13479   1       +   0   ID=g50.t1.CDS2;Parent=g50.t1
    chr1 AUGUSTUS  exon                     13249   14013   .       +   .   ID=g50.t1.exon2;Parent=g50.t1;
    chr1 AUGUSTUS  stop_codon               13477   13479   .       +   0   Parent=g50.t1;
    chr1 AUGUSTUS  three_prime_utr          13480   14013   0.17    +   .   ID=g50.t1.3UTR1;Parent=g50.t1
    chr1 AUGUSTUS  transcription_end_site   14013   14013   .       +   .   Parent=g50.t1;

=cut