This file is indexed.

/usr/lib/python2.7/dist-packages/pbalign/tools/createChemistryHeader.py is in python-pbalign 0.2.0-1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
#!/usr/bin/env python
"""createChemistryHeader.py gets chemistry triple information for movies in
a BLASR-produced SAM file. It writes a new SAM header file that contains the
chemisty information. This header can be used with samtools reheader. Most
of the work is actually done by BasH5Reader.
"""
import argparse
import copy
import logging
import sys

import pysam

from pbcore.io import BasH5IO, FofnIO

log = logging.getLogger('main')

MOVIENAME_TAG = 'PU'

class ChemistryLoadingException(Exception):
    """Exception when chemistry lookup fails."""
    pass

def format_rgds_entries(rgds_entries):
    """Turn the RG DS dictionary into a list of strings that
    can be placed into a header object.
    """

    rgds_strings = {}
    for rg_id in rgds_entries:
        rgds_string = ("BINDINGKIT:{b};SEQUENCINGKIT:{s};"
                       "SOFTWAREVERSION:{v}"
                       .format(b=rgds_entries[rg_id][0],
                               s=rgds_entries[rg_id][1],
                               v=rgds_entries[rg_id][2]))
        rgds_strings[rg_id] = rgds_string
    return rgds_strings

def extend_header(old_header, new_rgds_strings):
    """Create a new SAM/BAM header, adding the RG descriptions to the
    old_header.
    """

    new_header = copy.deepcopy(old_header)

    for rg_entry in new_header['RG']:
        try:
            new_ds_string = new_rgds_strings[rg_entry['ID']]
        except KeyError:
            continue

        if 'DS' in rg_entry:
            rg_entry['DS'] += ';' + new_ds_string
        else:
            rg_entry['DS'] = new_ds_string

    return new_header

def get_chemistry_info(sam_header, input_filenames, fail_on_missing=False):
    """Get chemistry triple information for movies referenced in a SAM
    header.

    Args:
        sam_header: a pysam.Samfile.header, which is a multi-level dictionary.
                    Movie names are read from RG tags in this header.
        input_filenames: a list of bas, bax, or fofn filenames.
        fail_on_missing: if True, raise an exception if the chemistry
                         information for a movie in the header cannot be
                         found. If False, just log a warning.
    Returns:
        a list of strings that can be written as DS tags to RG entries in the
        header of a new SAM or BAM file. For example,
        ['BINDINGKIT:xxxx;SEQUENCINGKIT:yyyy;SOFTWAREVERSION:2.0']

    Raises:
        ChemistryLoadingException if chemistry information cannot be found
        for a movie in the header and fail_on_missing is True.
    """

    # First get the full list of ba[sx] files, reading through any fofn or xml
    # inputs
    bas_filenames = []
    for filename in input_filenames:
        bas_filenames.extend(FofnIO.enumeratePulseFiles(filename))

    # Then get the chemistry triple for each movie in the list of bas files
    triple_dict = {}
    for bas_filename in bas_filenames:
        bas_file = BasH5IO.BasH5Reader(bas_filename)
        movie_name = bas_file.movieName
        chem_triple = bas_file.chemistryBarcodeTriple
        triple_dict[movie_name] = chem_triple

    # Finally, find the movie names that appear in the header and create CO
    # lines with the chemistry triple
    if 'RG' not in sam_header:
        return []
    rgds_entries = {}
    for rg_entry in sam_header['RG']:
        rg_id = rg_entry['ID']
        rg_movie_name = rg_entry[MOVIENAME_TAG]

        try:
            rg_chem_triple = triple_dict[rg_movie_name]
            rgds_entries[rg_id] = rg_chem_triple
        except KeyError:
            err_msg = ("Cannot find chemistry information for movie {m}."
                       .format(m=rg_movie_name))
            if fail_on_missing:
                raise ChemistryLoadingException(err_msg)
            else:
                log.warning(err_msg)

    rgds_strings = format_rgds_entries(rgds_entries)

    return rgds_strings

def get_parser():
    """Return an ArgumentParser for pbcompress options."""

    desc = ("createChemistryHeader creates a SAM header that contains the "
            "chemistry information used by Quiver.")

    parser = argparse.ArgumentParser(
        prog='getChemistryHeader.py', description=desc,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument("--debug", help="Output detailed log information.",
                        action='store_true')

    def sam_or_bam_filename(val):
        """Check that val names a SAM or BAM file."""
        if not (val.endswith(".bam") or val.endswith(".sam")):
            raise argparse.ArgumentTypeError(
                "File must end with .sam or .bam. {f} doesn't "
                "end with either of those."
                .format(f=val))
        return val

    parser.add_argument(
        "input_alignment_file",
        help="A SAM or BAM file produced by BLASR.",
        type=sam_or_bam_filename)

    parser.add_argument(
        "output_header_file",
        help=("Name of the SAM or BAM header file that will be created with "
              "chemistry information loaded."),
        type=sam_or_bam_filename)

    parser.add_argument(
        "--bas_files",
        help=("The bas or bax files containing the reads that were aligned in "
              "the input_alignment_file. Also can be a fofn of bas or bax "
              "files."),
        nargs='+',
        required=True)

    return parser

def setup_log(alog, file_name=None, level=logging.DEBUG, str_formatter=None):
    """Util function for setting up logging."""

    if file_name is None:
        handler = logging.StreamHandler(sys.stderr)
    else:
        handler = logging.FileHandler(file_name)

    if str_formatter is None:
        str_formatter = ('[%(levelname)s] %(asctime)-15s '
                         '[%(name)s %(funcName)s %(lineno)d] %(message)s')

    formatter = logging.Formatter(str_formatter)
    handler.setFormatter(formatter)
    alog.addHandler(handler)
    alog.setLevel(level)

def main():
    """Entry point."""
    parser = get_parser()
    args = parser.parse_args()

    if args.debug:
        setup_log(log, level=logging.DEBUG)
    else:
        setup_log(log, level=logging.INFO)

    input_file = pysam.Samfile(args.input_alignment_file, 'r')
    input_header = input_file.header
    log.debug("Read header from {f}.".format(f=input_file.filename))

    chemistry_rgds_strings = get_chemistry_info(
        input_header, args.bas_files)

    new_header = extend_header(input_header, chemistry_rgds_strings)

    if args.output_header_file.endswith('.bam'):
        output_file = pysam.Samfile(args.output_header_file, 'wb',
                                    header=new_header)
    elif args.output_header_file.endswith('.sam'):
        output_file = pysam.Samfile(args.output_header_file, 'wh',
                                    header=new_header)

    output_file.close()

if __name__ == '__main__':
    main()