This file is indexed.

/usr/lib/python2.7/dist-packages/macsypy/database.py is in macsyfinder 1.0.5-1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
# -*- coding: utf-8 -*-

################################################################################
# MacSyFinder - Detection of macromolecular systems in protein datasets        #
#               using systems modelling and similarity search.                 #
# Authors: Sophie Abby, Bertrand Néron                                         #
# Copyright © 2014  Institut Pasteur (Paris) and CNRS.                         #
# See the COPYRIGHT file for details                                           #
#                                                                              #
# MacsyFinder is distributed under the terms of the GNU General Public License #
# (GPLv3). See the COPYING file for details.                                   #
################################################################################




from itertools import groupby
from collections import namedtuple
from glob import glob
import os.path
import logging
_log = logging.getLogger('macsyfinder.' + __name__)
from subprocess import Popen
from .macsypy_error import MacsypyError


def fasta_iter(fasta_file):
    """
    :param fasta_file: the file containing all input sequences in fasta format.
    :type fasta_file: file object
    :author: http://biostar.stackexchange.com/users/36/brentp
    :return: for a given fasta file, it returns an iterator which yields tuples
             (string id, string comment, int sequence length)
    :rtype: iterator
    """
    # ditch the boolean (x[0]) and just keep the header or sequence since
    # we know they alternate.
    faiter = (x[1] for x in groupby(fasta_file, lambda line: line[0] == ">"))
    for header in faiter:
        # drop the ">"
        header = header.next()[1:].strip()
        header = header.split()
        _id = header[0]
        comment = ' '.join(header[1:])
        seq = ''.join(s.strip() for s in faiter.next())
        length = len(seq)
        yield _id, comment, length


class Indexes(object):
    """
    Handle the indexes for macsyfinder:

     - find the indexes for hmmer, or build them using formatdb or makeblastdb external tools
     - find the indexes required by macsyfinder to compute some scores, or build them.
    """


    def __init__(self, cfg):
        """
        The constructor retrieves the file of indexes in the case they are not present
        or the user asked for build indexes (--idx) 
        Launch the indexes building. 

        :param cfg: the configuration 
        :type cfg: :class:`macsypy.config.Config` object
        """
        self.cfg = cfg
        self._fasta_path = cfg.sequence_db
        self.name = os.path.basename(cfg.sequence_db)
        self._hmmer_indexes = None  # list of path
        self._my_indexes = None  # path

    def build(self, force=False):
        """
        Build the indexes from the sequence dataset in fasta format

        :param force: If True, force the index building even if the index files are present in the sequence dataset folder
        :type force: boolean
        """
        hmmer_indexes = self.find_hmmer_indexes()
        my_indexes = self.find_my_indexes()

        ###########################
        # build indexes if needed #
        ###########################
        index_dir = os.path.abspath(os.path.dirname(self.cfg.sequence_db))

        if force or not hmmer_indexes or not my_indexes:
            # formatdb create indexes in the same directory as the sequence_db
            # so it must be writable
            # if the directory is not writable, formatdb do a Segmentation fault
            if not os.access(index_dir, os.R_OK|os.W_OK):
                msg = "cannot build indexes, ({0}) is not writable".format(index_dir)
                _log.critical(msg)
                raise IOError(msg)

        if force or not hmmer_indexes:
            # self._build_hmmer_indexes() is asynchron
            hmmer_indexes_proc = self._build_hmmer_indexes()
        if force or not my_indexes:
            # self._build_my_indexes() is synchron
            self._build_my_indexes()

        ################################# 
        # synchronization point between #
        # hmmer_indexes and my_indexes  #
        #################################
        if force or not hmmer_indexes:
            hmmer_indexes_proc.wait()
            if hmmer_indexes_proc.returncode == 127:
                msg = "neither makeblastdb nor formatdb can be found, check your config or install makeblastb"
                _log.critical(msg, exc_info=True)
                raise RuntimeError(msg)
            if hmmer_indexes_proc.returncode != 0:
                msg = "an error occurred during databases indexation see formatdb.log"
                _log.critical(msg, exc_info=True)
                raise RuntimeError(msg)
        self._hmmer_indexes = self.find_hmmer_indexes()
        self._my_indexes = self.find_my_indexes()
        assert self._hmmer_indexes, "failed to create hmmer indexes"
        assert self._my_indexes, "failed create macsyfinder indexes"


    def find_hmmer_indexes(self):
        """
        :return: The hmmer index files. 
                 If indexes are inconsistent (some file(s) missing), a Runtime Error is raised
        :rtype: list of string 
        """
        suffixes = ('.phr', '.pin', '.psd', '.psi', '.psq', '.pal')
        idx = []
        file_nb = 0
        for suffix in suffixes:
            index_files = glob("{0}*{1}".format(self._fasta_path, suffix))
            nb_of_index = len(index_files)
            if suffix != '.pal':
                if file_nb and file_nb != nb_of_index:
                    msg = "some index files are missing.\
 Delete all index files (*.phr, *.pin, *.psd, *.psi, *.psq, *.pal) and try to rebuild them."
                    _log.critical(msg)
                    raise RuntimeError(msg)
            else:
                if nb_of_index > 1:
                    msg = "too many .pal file.\
 Delete all index files (*.phr, *.pin, *.psd, *.psi, *.psq, *.pal) and try to rebuild them."
                    _log.critical(msg)
                    raise RuntimeError(msg)
                elif file_nb > 1 and nb_of_index == 0:
                    msg = "some index files are missing.\
 Delete all index files (*.phr, *.pin, *.psd, *.psi, *.psq, *.pal) and try to rebuild them."
                    _log.critical(msg)
                    raise RuntimeError(msg)
                elif file_nb == 1 and nb_of_index == 1:
                    msg = "a virtual index is detected (.pal) but there is only one file per index type.\
 Delete all index files  (*.phr, *.pin, *.psd, *.psi, *.psq, *.pal) and try to rebuild them."
                    _log.critical(msg)
                    raise RuntimeError(msg)
            idx.extend(index_files)
            file_nb = nb_of_index
        return idx


    def find_my_indexes(self):
        """
        :return: the file of macsyfinder indexes if it exists in the dataset folder, None otherwise. 
        :rtype: string
        """ 
        path = os.path.join(os.path.dirname(self.cfg.sequence_db), self.name + ".idx")
        if os.path.exists(path):
            return path


    def _build_hmmer_indexes(self):
        """
        build the index files for hmmer using the formatdb or makeblastdb tool
        """
        index_dir = os.path.dirname(self.cfg.sequence_db)
        if self.cfg.index_db_exe.find('makeblast') != -1:
            command = "{0} -title {1} -in {2} -dbtype prot -parse_seqids".format(self.cfg.index_db_exe,
                                                                                 self.name,
                                                                                 self.cfg.sequence_db)
        elif self.cfg.index_db_exe.find('formatdb') != -1:
            # -t  Title for database file [String]
            # -i Input file(s) for formatting [File In]
            # -p T Type of file = protein
            # -o T Parse SeqId and create indexes.
            # -s T Create indexes limited only to accessions
            command = "{db_indexer} -t {db_name} -i {db_file} -p T -o T -s T".format(db_indexer=self.cfg.index_db_exe,
                                                                                     db_name=self.name,
                                                                                     db_file=self.cfg.sequence_db
                                                                                    )
        else:
            raise MacsypyError("{0} is not supported to index the sequence dataset.\
 Please use makeblastdb or formatdb.".format(self.cfg.sequence_db))

        _log.debug("hmmer index command: {0}".format(command))
        err_path = os.path.join(index_dir, "formatdb.err")
        with  open(err_path, 'w') as err_file:
            try:
                formatdb = Popen(command,
                                 shell=True,
                                 stdout=err_file,
                                 stdin=None,
                                 stderr=err_file,
                                 close_fds=False,
                                 )
            except Exception as err:
                msg = "unable to index the sequence dataset : {0} : {1}".format(command, err)
                _log.critical(msg, exc_info=True)
                raise err
            return formatdb


    def _build_my_indexes(self):
        """
        Build macsyfinder indexes. These indexes are stored in a file.

        The file format is the following:
         - one entry per line, with each line having this format:
         - sequence id;sequence length;sequence rank

        """
        try:
            with open(self._fasta_path, 'r') as fasta_file:
                with open(os.path.join(os.path.dirname(self.cfg.sequence_db), self.name + ".idx"), 'w') as my_base:
                    f_iter = fasta_iter(fasta_file)
                    seq_nb = 0
                    for seqid, comment, length in f_iter:
                        seq_nb += 1
                        my_base.write("{seq_id};{length:d};{seq_nb:d}\n".format(seq_id=seqid, length=length, seq_nb=seq_nb))
        except Exception as err:
            msg = "unable to index the sequence dataset: {0} : {1}".format(self.cfg.sequence_db, err)
            _log.critical(msg, exc_info=True)
            raise err


"""handle name, topology type, and min/max positions in the sequence dataset for a replicon"""
RepliconInfo = namedtuple('RepliconInfo', 'topology, min, max, genes')


class RepliconDB(object):
    """
    Stores information (topology, min, max, [genes]) for all replicons in the sequence_db
    the Replicon object must be instantiated only for sequence_db of type 'gembase' or 'ordered_replicon'
    """

    ordered_replicon_name = 'UserReplicon'

    def __init__(self, cfg):
        """
        :param cfg: The configuration object
        :type cfg: :class:`macsypy.config.Config` object

        .. note ::
            This class can be instanciated only if the db_type is 'gembase' or 'ordered_replicon' 
        """
        self.cfg = cfg
        assert self.cfg.db_type in ('gembase', 'ordered_replicon')
        idx = Indexes(self.cfg)
        self.sequence_idx = idx.find_my_indexes()
        self.topology_file = self.cfg.topology_file
        self._DB = {}
        if self.topology_file:
            topo_dict = self._fill_topology()
        else:
            topo_dict = {}
        if self.cfg.db_type == 'gembase':
            self._fill_gembase_min_max(topo_dict, default_topology=self.cfg.replicon_topology)
        else:
            self._fill_ordered_min_max(self.cfg.replicon_topology)


    def _fill_topology(self):
        """
        Fill the internal dictionary with min and max positions for each replicon_name of the sequence_db
        """
        topo_dict = {}
        with open(self.topology_file) as topo_f:
            for l in topo_f:
                if l.startswith('#'): 
                    continue
                replicon_name, topo = l.split(':')
                replicon_name = replicon_name.strip()
                topo = topo.strip().lower()
                topo_dict[replicon_name] = topo
        return topo_dict

    def _fill_ordered_min_max(self, default_topology=None):
        """
        For the replicon_name of the ordered_replicon sequence base, fill the internal dict with RepliconInfo

        :param default_topology: the topology provided by config.replicon_topology 
        :type default_topology: string
        """
        _min = 1
        # self.sequence_idx is a file with the following structure seq_id;seq_length;seq_rank\n
        with open(self.sequence_idx) as idx_f:
            _max = 0
            genes = []
            for l in idx_f:
                seq_id, length, _rank = l.split(";")
                genes.append((seq_id, length))
                _max += 1
            self._DB[self.ordered_replicon_name] = RepliconInfo(default_topology, _min, _max, genes)


    def _fill_gembase_min_max(self, topology, default_topology):
        """
        For each replicon_name of a gembase dataset, it fills the internal dictionary with a namedtuple RepliconInfo

        :param topology: the topologies for each replicon 
                         (parsed from the file specified with the option --topology-file)
        :type topology: dict
        :param default_topology: the topology provided by the config.replicon_topology 
        :type default_topology: string
        """
        def grp_replicon(line):
            """
            in gembase the identifier of fasta sequence follows the following schema: 
            <replicon-name>_<seq-name> with eventually '_' inside the <replicon_name>
            but not in the <seq-name>.
            so grp_replicon allow to group sequences belonging to the same replicon.
            """
            return "_".join(line.split('_')[: -1])

        def parse_entry(entry):
            """
            parse an entry in the index file (.idx)
            an entry have the following format sequence_id;sequence length;sequence rank in replicon
            """
            entry = entry.rstrip()
            seq_id, length, rank = entry.split(';')
            replicon_name = "_".join(seq_id.split('_')[: -1])
            seq_name = seq_id.split('_')[-1]
            return replicon_name, seq_name, length, int(rank)

        with open(self.sequence_idx) as idx_f:
            replicons = (x[1] for x in groupby(idx_f, grp_replicon))
            for replicon in replicons:
                genes = []
                entry = replicon.next()
                replicon_name, seq_name, seq_length, _min = parse_entry(entry)
                genes.append((seq_name, seq_length))
                for entry in replicon:
                    # pass all sequence of the replicon until the last one
                    _, seq_name, seq_length, _ = parse_entry(entry)
                    genes.append((seq_name, seq_length))
                _, seq_name, seq_length, _max = parse_entry(entry)
                genes.append((seq_name, seq_length))
                if replicon_name in topology:
                    self._DB[replicon_name] = RepliconInfo(topology[replicon_name], _min, _max, genes)
                else:
                    self._DB[replicon_name] = RepliconInfo(default_topology, _min, _max, genes)


    def __contains__(self, replicon_name):
        """
        :param replicon_name: the name of the replicon
        :type replicon_name: string
        :returns: True if replicon_name is in the repliconDB, false otherwise.
        :rtype: boolean
        """
        return replicon_name in self._DB


    def __getitem__(self, replicon_name):
        """
        :param replicon_name: the name of the replicon to get information on
        :type replicon_name: string
        :returns: the RepliconInfo for the provided replicon_name 
        :rtype: :class:`RepliconInfo` object
        :raise: KeyError if replicon_name is not in repliconDB
        """
        return self._DB[replicon_name]


    def get(self, replicon_name, default=None):
        """
        :param replicon_name: the name of the replicon to get informations
        :type replicon_name: string
        :param default: the value to return if the replicon_name is not in the RepliconDB
        :type default: any
        :returns: the RepliconInfo for replicon_name if replicon_name is in the repliconDB, else default.
        If default is not given, it is set to None, so that this method never raises a KeyError.
        :rtype: :class:`RepliconInfo` object
        """
        return self._DB.get(replicon_name, default)


    def items(self):
        """
        :return: a copy of the RepliconDB as a list of (replicon_name, RepliconInfo) pairs
        """
        return self._DB.items()


    def iteritems(self):
        """
        :return: an iterator over the RepliconDB as a list (replicon_name, RepliconInfo) pairs
        """
        return self._DB.iteritems()


    def replicon_names(self):
        """
        :return: a copy of the RepliconDB as a list of replicon_names
        """
        return self._DB.keys()


    def replicon_infos(self):
        """
        :return: a copy of the RepliconDB as list of replicons info
        :rtype: RepliconInfo instance
        """
        return self._DB.values()