/usr/bin/rgfa-findcrisprs is in ruby-rgfa 1.3.1-1.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 | #!/usr/bin/ruby
require "rgfatools"
# crisprs have a structure ARU1RU..RUnRB where |U|~|R| in [24..50]
$debugmode = false
$spacersonly = false
class RGFA
def find_crisprs(minrepeats=3,minlen=24,maxlen=50)
ls = {}
segment_names.each do |sn|
s = segment(sn)
s.cn = (s.coverage(unit_length: @default[:unit_length],
count_tag: @default[:count_tag])/2).round
end
output_segment_infos if $debugmode
maxvisits_global = {:B => {}, :E => {}}
segment_names.each do |sn|
s = segment(sn)
next if s.length < minlen or s.length > maxlen
next if s.cn < minrepeats
circles = {}
linear = {}
maxvisits = {}
[:B, :E].each do |rt|
maxvisits[rt] = maxvisits_global[rt].dup
maxvisits[rt][sn] ||= s.cn
circles[rt] = []
linear[rt] = []
segment_end = [s, rt].to_segment_end
links_of(segment_end).each do |l|
search_circle(segment_end.invert_end_type,
segment_end,
l,
maxvisits[rt],0,
minlen,
maxlen*2+s.length,
[segment_end],
circles[rt],
linear[rt])
end
if maxvisits[rt][sn.to_sym] > 0
multi = {:l => [], :c => []}
[[linear[rt],:l], [circles[rt],:c]].each do |paths, pt|
paths.each do |c|
min_mv = s.cn
upto = (pt == :l ? -1 : -2)
c[0..upto].each do |csn, et|
mv = maxvisits[rt][csn.to_sym]
if mv < min_mv
min_mv = mv
end
end
if min_mv > 0
min_mv.times { multi[pt] << c.dup }
c[0..upto].each do |csn, et|
maxvisits[rt][csn.to_sym] -= min_mv
end
end
end
end
circles[rt] += multi[:c]
linear[rt] += multi[:l]
end
end
n_paths = (circles[:E].size+circles[:B].size+
linear[:E].size+linear[:B].size)
if (circles[:E].size - circles[:B].size).abs > 1
next
end
if (linear[:E].size - linear[:B].size).abs > 0
next
end
if linear[:E].size != 1
next
end
merged_circles = []
circles[:E].each {|c|merged_circles << merge_crisprs_path(c,s,:E)}
before = merge_crisprs_path(linear[:B].first,s,:B)
after = merge_crisprs_path(linear[:E].first,s,:E)
next if merged_circles.size < minrepeats
maxvisits_global = maxvisits
instances = 1
possible_instances = 0
merged_circles.each do |seq|
if seq.length > s.length + minlen
possible_instances += 1
end
instances += 1
end
if $spacersonly
puts merged_circles.sort.map(&:upcase)
else
puts "CRISP signature found in segment #{s.name}"
puts
puts " Before: sequence = ...#{before[-50..-1]}"
puts
if possible_instances > 0
instances = "#{instances}..#{instances+possible_instances}"
end
puts " Repeat: instances = #{instances}; "+
"length = #{s.length};\t"+
"sequence = #{s.sequence}"
puts
puts " Spacers:"
asterisk = false
merged_circles.each_with_index do |seq, i|
if seq.length > s.length + minlen
str = "=#{s.length}+2*#{(seq.length.to_f - s.length)/2}"
asterisk = true
this_asterisk = true
else
str = ""
this_asterisk = false
end
puts " (#{i+1}#{this_asterisk ? "*" : ""})\t"+
"length = #{seq.length}#{str};\tsequence = #{seq}"
end
if asterisk
puts
puts " * = possibly containing inexact repeat instance"
end
puts
puts "After: sequence = #{after[0..49]}..."
end
end
end
private
def output_segment_infos
segment_names.each do |sn|
s = segment(sn)
puts "#{s.name}\t#{s.cn}\t"+
"#{neighbours([s.name,:B]).map{|nb|segment(nb.segment).cn}.inject(:+)}\t"+
"#{neighbours([s.name,:E]).map{|nb|segment(nb.segment).cn}.inject(:+)}\t"+
"#{links_of([s.name,:B]).size}\t"+
"#{links_of([s.name,:E]).size}\t"+
"#{s.KC}\t#{s.length}"
end
end
def merge_crisprs_path(segpath, repeat, repeat_end)
merged = create_merged_segment(segpath, merged_name: :short,
disable_tracking: true)[0]
sequence = merged.sequence[repeat.
sequence.length..-(1+repeat.sequence.length)]
sequence = sequence.rc if repeat_end == :B
return sequence
end
def search_circle(goal, from, l, maxvisits, dist, mindist,
maxdist, path, circles, linear)
dest = l.other_end(from)
dest.segment = segment(dest.segment)
maxvisits[dest.name] ||= dest.segment.cn
se = dest.invert_end_type
if dest == goal
return if dist < mindist
new_path = path.dup
new_path << se
new_path[0..-2].each {|x| maxvisits[x.name] -= 1}
circles << new_path
return
end
return if maxvisits[dest.name] == 0
return if path.any?{|x|x.name==dest.name}
new_path = path.dup
new_path << se
dist += dest.segment.length - l.overlap.first.len
if dist > maxdist
new_path = path.dup
new_path << se
new_path[0..-1].each {|x| maxvisits[x.name] -= 1}
linear << new_path
return
end
ls = links_of(se)
if ls.size == 0
new_path[0..-1].each {|x| maxvisits[x.name] -= 1}
linear << new_path
return
end
ls.each do |next_l|
next_dest = segment(next_l.other_end(se).segment)
maxvisits[next_dest.name] ||= next_dest.cn
next if maxvisits[next_dest.name] == 0
search_circle(goal,se,next_l,maxvisits,dist,mindist,maxdist,new_path,
circles,linear)
end
return
end
end
if (ARGV.size == 0)
STDERR.puts "Usage: #$0 <gfa>"
exit 1
end
gfa = RGFA.from_file(ARGV[0])
gfa.set_default_count_tag(:KC)
gfa.header.ks ||= gfa.segments[0].length + 1
gfa.set_count_unit_length(gfa.header.ks-1)
gfa.find_crisprs
|